# Imports

In [1]:
# import packages needed
import nltk
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from nltk.sentiment import SentimentIntensityAnalyzer
import scipy.stats
from collections import Counter
import re

In [2]:
# create some variables and objects needed
unwanted = nltk.corpus.stopwords.words("english")
names = nltk.corpus.names.words()
sia = SentimentIntensityAnalyzer()

# Setting up

In [3]:
# define the functions
# raw counts of words
def get_top_words_mean_std(corpus, N=None, drop_unwanted = True, unwanted = nltk.corpus.stopwords.words("english")):
    vec = CountVectorizer()
    bag_of_words = vec.fit_transform(corpus)
    std_words = bag_of_words.toarray().std(axis=0) 
    mean_words = bag_of_words.toarray().mean(axis=0)
    words_mean_std = [tuple(x) for x in zip(vec.get_feature_names(), mean_words, std_words)]
    words_mean_std = sorted(words_mean_std, key = lambda x: x[1], reverse=True)
    
    df = pd.DataFrame(words_mean_std, columns = ['word', 'avg_mentions', 'std_mentions']).sort_values('avg_mentions', ascending = False)
    if drop_unwanted == True:
        df = df[~df.word.isin(unwanted)]
    return df.head(n=N)

# binary count - 1 if it appears, 0 otherwise
def get_top_words_proportions(corpus, N=None, drop_unwanted = True, unwanted = nltk.corpus.stopwords.words("english")):
    vec = CountVectorizer(binary = True)
    bag_of_words = vec.fit_transform(corpus)
    proportion_words = bag_of_words.toarray().mean(axis=0)
    words_proportion = [tuple(x) for x in zip(vec.get_feature_names(), proportion_words)]
    words_proportion = sorted(words_proportion, key = lambda x: x[1], reverse=True)
    
    df = pd.DataFrame(words_proportion, columns = ['word', 'percent_containing']).sort_values('percent_containing', ascending = False)
    if drop_unwanted == True:
        df = df[~df.word.isin(unwanted)]
    return df.head(n=N)


def get_top_words_stats_df(df, corpus_col, N=None, drop_unwanted = True, unwanted = nltk.corpus.stopwords.words("english")):
    
    corpus = df[corpus_col].to_list()
    
    mean_std_df = get_top_words_mean_std(corpus, None, drop_unwanted, unwanted)
    proportion_df = get_top_words_proportions(corpus, None, drop_unwanted, unwanted)
    
    all_stats_df = mean_std_df.merge(proportion_df, on = 'word', how = 'outer').fillna(0)
    
    return all_stats_df.head(n=N)

def sentiment_of_word(df, corpus_col, word, sensitivity = .75):
        print('Word: ', word)
        
        corpus = df[corpus_col].to_list()
        
        count = 0
        pos_count = 0
        neg_count = 0
        neut_count = 0
        for entry in corpus:
            for sentence in nltk.sent_tokenize(entry):
                if ' ' + word + ' ' in sentence:
                    count += 1
                    if sia.polarity_scores(sentence)["compound"] > sensitivity:
                        pos_count += 1
                    elif sia.polarity_scores(sentence)["compound"] < -1 * sensitivity:
                        neg_count += 1
                    else:
                        neut_count += 1
        
        print('\tPercent of mentions that are positive: {0:.3f}'\
             .format(pos_count / count))
        
        print('\tPercent of mentions that are negative: {0:.3f}'\
             .format(neg_count / count))


def get_common_bigrams(df, corpus_col, N=None):
    corpus = df[corpus_col].to_list()
    tokenized_review = [nltk.word_tokenize(review) for review in corpus]
    cleaned_reviews = [[word for word in review if word not in unwanted\
        if word.isalpha()] for review in tokenized_review]
    
    bigrams = Counter({})

    for review in cleaned_reviews:
        bigrams += Counter(nltk.collocations.BigramCollocationFinder.from_words(review).ngram_fd.keys())
    
    bigram_df = pd.DataFrame(bigrams.most_common(N), columns = ['bigram', 'percent_containing'])
    bigram_df['percent_containing'] = bigram_df['percent_containing'] / df[corpus_col].count()
   
    bigram_df['bigram'] = bigram_df['bigram'].apply(lambda x: x[0] + ' ' + x[1])

    return bigram_df.sort_values('percent_containing', ascending = False).head(n=N)

 
def contains_theme(x, theme):
    if re.compile('|'.join(theme),re.IGNORECASE).search(x):
        return 1
    else:
        return 0
    
def get_theme_frequency(df, corpus_col, theme):
    temp = df
    temp['contains_theme'] = temp[corpus_col].apply(lambda x: contains_theme(x, theme))
    print('Theme words: ', theme)
    print('Percent of entries containing theme: {0:.3f}'.format(temp.contains_theme.mean()))
    
def sentiment_of_theme(df, corpus_col, theme, sensitivity = .75):
    print('Theme words: ', theme)
    corpus = df[corpus_col].to_list()
    count = 0
    pos_count = 0
    neg_count = 0
    neut_count = 0
    for entry in corpus:
        for sentence in nltk.sent_tokenize(entry):
            if re.compile('|'.join(theme),re.IGNORECASE).search(sentence):
                    count += 1
                    if sia.polarity_scores(sentence)["compound"] > sensitivity:
                        pos_count += 1
                    elif sia.polarity_scores(sentence)["compound"] < -1 * sensitivity:
                        neg_count += 1
                    else:
                        neut_count += 1
        
    print('\tPercent of mentions that are positive: {0:.3f}'\
         .format(pos_count / count))

    print('\tPercent of mentions that are negative: {0:.3f}'\
         .format(neg_count / count))

In [4]:
# create a dataframe to practice with
positive_review_ids = nltk.corpus.movie_reviews.fileids(categories=["pos"])
negative_review_ids = nltk.corpus.movie_reviews.fileids(categories=["neg"])
pos_df = pd.DataFrame(positive_review_ids, columns = ['review_id'])
pos_df['rating'] = 'pos'
neg_df = pd.DataFrame(negative_review_ids, columns = ['review_id'])
neg_df['rating'] = 'neg'
rating_df = pd.concat([pos_df, neg_df]).reset_index().drop('index', axis = 1)
rating_df['review_text'] = rating_df.review_id.apply(lambda x: nltk.corpus.movie_reviews.raw(x))

In [5]:
df = rating_df
df.head(n=5)

Unnamed: 0,review_id,rating,review_text
0,pos/cv000_29590.txt,pos,films adapted from comic books have had plenty...
1,pos/cv001_18431.txt,pos,every now and then a movie comes along from a ...
2,pos/cv002_15918.txt,pos,you've got mail works alot better than it dese...
3,pos/cv003_11664.txt,pos,""" jaws "" is a rare film that grabs your atten..."
4,pos/cv004_11636.txt,pos,moviemaking is a lot like being the general ma...


# Terms

### Frequency

Which words show up most in all reviews?

In [7]:
term_freqs = get_top_words_stats_df(df, 'review_text')
term_freqs.head(n=15)

Unnamed: 0,word,avg_mentions,std_mentions,percent_containing
0,film,4.7585,4.475173,0.8765
1,one,2.926,2.398234,0.888
2,movie,2.8855,3.23085,0.777
3,like,1.845,1.842274,0.7485
4,even,1.2825,1.40417,0.6455
5,good,1.2055,1.497087,0.591
6,time,1.2055,1.354352,0.629
7,story,1.0845,1.533414,0.539
8,would,1.0545,1.310927,0.5675
9,much,1.0245,1.231625,0.569


These seem like generic terms that we might expect in all reviews. Are there differences between the positive and negative reviews?

In [8]:
term_freqs_pos = get_top_words_stats_df(df[df.rating=='pos'], 'review_text')
term_freqs_pos.head(n=5)

Unnamed: 0,word,avg_mentions,std_mentions,percent_containing
0,film,5.23,4.846555,0.897
1,one,3.052,2.507448,0.899
2,movie,2.525,2.984858,0.739
3,like,1.802,1.891242,0.724
4,good,1.248,1.54094,0.596


In [9]:
term_freqs_neg = get_top_words_stats_df(df[df.rating=='neg'], 'review_text')
term_freqs_neg.head(n=5)

Unnamed: 0,word,avg_mentions,std_mentions,percent_containing
0,film,4.287,4.015051,0.856
1,movie,3.246,3.42162,0.815
2,one,2.8,2.27684,0.877
3,like,1.888,1.790937,0.773
4,even,1.386,1.447413,0.672


It seems like 'film' could be a signal for movies with positive ratings, while 'movie' may be used more in negative reviews. Where do the greatest absolute differences exist?

In [10]:
catch = term_freqs_pos.merge(term_freqs_neg, on = 'word', how = 'outer', suffixes = ['_pos', '_neg'])

In [11]:
catch['abs_diff'] = catch.apply(lambda x: np.abs(x[3] - x[6]), axis = 1)
catch[['word', 'abs_diff', 'percent_containing_pos', 'percent_containing_neg']].sort_values('abs_diff', ascending = False).head(15)

Unnamed: 0,word,abs_diff,percent_containing_pos,percent_containing_neg
108,bad,0.255,0.259,0.514
11,life,0.158,0.492,0.334
1367,worst,0.15,0.044,0.194
37,plot,0.142,0.375,0.517
7,also,0.137,0.604,0.467
144,script,0.129,0.209,0.338
28,great,0.125,0.411,0.286
21,best,0.124,0.489,0.365
32,world,0.122,0.363,0.241
1313,boring,0.121,0.048,0.169


Directional language is typically the top across absolute differences. Makes sense. Still, some other words are a signal (e.g. life, plot, script, and performances). Biopics may be the secret sauce, while bad writing is killer. What about relative differences?

In [12]:
catch['rel_diff'] = catch.apply(lambda x: np.abs(x[3] - x[6]) / max(x[3], x[6]), axis = 1)
catch[['word', 'rel_diff', 'percent_containing_pos', 'percent_containing_neg']].sort_values('rel_diff', ascending = False).head(5)

Unnamed: 0,word,rel_diff,percent_containing_pos,percent_containing_neg
3470,avoids,0.947368,0.019,0.001
3636,astounding,0.944444,0.018,0.001
3769,slip,0.941176,0.017,0.001
3864,fascination,0.9375,0.016,0.001
26402,3000,0.9375,0.001,0.016


This metric is dominated by small counts.

### Sentiment analysis

In [13]:
words_of_interest = ['actors', 'writer', 'style', 'moment', 'performances', 'plot', 'script']

In [14]:
for word in words_of_interest:
    sentiment_of_word(df, 'review_text', word)

Word:  actors
	Percent of mentions that are positive: 0.153
	Percent of mentions that are negative: 0.042
Word:  writer
	Percent of mentions that are positive: 0.139
	Percent of mentions that are negative: 0.066
Word:  style
	Percent of mentions that are positive: 0.114
	Percent of mentions that are negative: 0.042
Word:  moment
	Percent of mentions that are positive: 0.088
	Percent of mentions that are negative: 0.039
Word:  performances
	Percent of mentions that are positive: 0.231
	Percent of mentions that are negative: 0.043
Word:  plot
	Percent of mentions that are positive: 0.069
	Percent of mentions that are negative: 0.053
Word:  script
	Percent of mentions that are positive: 0.123
	Percent of mentions that are negative: 0.057


When reviewers are discussing actors and writers, there is a tendency to discuss them positively. Plot usage seems to be more balanced  We have a balanced positive / negative data set, but I wonder if usage in these reviews use words differently (i.e. more positive discussion in positive reviews).

In [15]:
for word in words_of_interest:
    sentiment_of_word(df[df.rating=='pos'], 'review_text', word)

Word:  actors
	Percent of mentions that are positive: 0.196
	Percent of mentions that are negative: 0.028
Word:  writer
	Percent of mentions that are positive: 0.181
	Percent of mentions that are negative: 0.028
Word:  style
	Percent of mentions that are positive: 0.115
	Percent of mentions that are negative: 0.030
Word:  moment
	Percent of mentions that are positive: 0.099
	Percent of mentions that are negative: 0.026
Word:  performances
	Percent of mentions that are positive: 0.288
	Percent of mentions that are negative: 0.018
Word:  plot
	Percent of mentions that are positive: 0.121
	Percent of mentions that are negative: 0.030
Word:  script
	Percent of mentions that are positive: 0.213
	Percent of mentions that are negative: 0.031


In [16]:
for word in words_of_interest:
    sentiment_of_word(df[df.rating=='neg'], 'review_text', word)

Word:  actors
	Percent of mentions that are positive: 0.112
	Percent of mentions that are negative: 0.055
Word:  writer
	Percent of mentions that are positive: 0.101
	Percent of mentions that are negative: 0.101
Word:  style
	Percent of mentions that are positive: 0.113
	Percent of mentions that are negative: 0.056
Word:  moment
	Percent of mentions that are positive: 0.078
	Percent of mentions that are negative: 0.052
Word:  performances
	Percent of mentions that are positive: 0.123
	Percent of mentions that are negative: 0.089
Word:  plot
	Percent of mentions that are positive: 0.035
	Percent of mentions that are negative: 0.068
Word:  script
	Percent of mentions that are positive: 0.067
	Percent of mentions that are negative: 0.074


Evene among negative reviews, some words get used positively pretty often. Still, as expected, positive reviews are more positive in their discussion of these key topics.


# Bigrams

Words often travel in pairs (e.g. baseball bat). Are there such trends in the reviews?

### Frequency

In [17]:
bigrams_df = get_common_bigrams(df,'review_text')
bigrams_df

Unnamed: 0,bigram,percent_containing
0,special effects,0.1180
1,even though,0.0955
2,new york,0.0800
3,looks like,0.0655
4,one best,0.0630
...,...,...
208878,less willing,0.0005
208877,radical less,0.0005
208876,struggle radical,0.0005
208875,lose struggle,0.0005


Special effects is the top bigram across all movies!

In [18]:
bigrams_pos_df = get_common_bigrams(df[df.rating=='pos'],'review_text')
bigrams_pos_df

Unnamed: 0,bigram,percent_containing
0,special effects,0.112
1,even though,0.105
2,one best,0.100
3,new york,0.084
4,takes place,0.068
...,...,...
114278,costumes alexandra,0.001
114279,byrne deserve,0.001
114280,deserve mention,0.001
114281,mention inportant,0.001


In [19]:
bigrams_neg_df = get_common_bigrams(df[df.rating=='neg'],'review_text')
bigrams_neg_df

Unnamed: 0,bigram,percent_containing
0,special effects,0.124
1,even though,0.086
2,looks like,0.083
3,new york,0.076
4,look like,0.065
...,...,...
100997,need bashed,0.001
100998,bashed head,0.001
100999,point bad,0.001
101000,guys symbolic,0.001


There may be some subtle top-bigram differences in positively and negatively received movies. Are there any large differnces, like we saw in term mentions?

In [20]:
catch = bigrams_pos_df.merge(bigrams_neg_df, on = 'bigram', how = 'outer', suffixes = ['_pos', '_neg'])

In [21]:
catch['abs_diff'] = catch.apply(lambda x: np.abs(x[1]-x[2]), axis = 1)

In [22]:
catch.sort_values('abs_diff', ascending = False).head(30)

Unnamed: 0,bigram,percent_containing_pos,percent_containing_neg,abs_diff
2,one best,0.1,0.026,0.074
916,bad movie,0.009,0.06,0.051
3468,waste time,0.005,0.046,0.041
18,looks like,0.048,0.083,0.035
6320,one worst,0.003,0.037,0.034
12,film also,0.051,0.018,0.033
88,action sequences,0.029,0.061,0.032
54,best films,0.035,0.006,0.029
473,even worse,0.013,0.042,0.029
46,look like,0.037,0.065,0.028


Some directional language, some thematic insights, and some raw content (e.g. star wars).

# Themes

Some words travel in pairs, but some pairs (or groups of 3+ words) have similar meanings. Do such groups occur more / less in different films?

### Frequency

In [23]:
plot_theme = ['plot', 'storyline', 'story', 'narrative', 'events', 'story line', 'story', 'diegesis']
comedy_theme = ['humor', 'humorous', 'funny', 'comedy', 'laugh', 'joke', 'goofy']

themes = [plot_theme, comedy_theme]

In [24]:
for theme in themes:
    get_theme_frequency(df, 'review_text', theme)
    print()

Theme words:  ['plot', 'storyline', 'story', 'narrative', 'events', 'story line', 'story', 'diegesis']
Percent of entries containing theme: 0.813

Theme words:  ['humor', 'humorous', 'funny', 'comedy', 'laugh', 'joke', 'goofy']
Percent of entries containing theme: 0.578



In [25]:
for theme in themes:
    get_theme_frequency(df.loc[df.rating=='pos'], 'review_text', theme)
    print()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


Theme words:  ['plot', 'storyline', 'story', 'narrative', 'events', 'story line', 'story', 'diegesis']
Percent of entries containing theme: 0.793

Theme words:  ['humor', 'humorous', 'funny', 'comedy', 'laugh', 'joke', 'goofy']
Percent of entries containing theme: 0.530



In [26]:
for theme in themes:
    get_theme_frequency(df.loc[df.rating=='neg'], 'review_text', theme)
    print()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


Theme words:  ['plot', 'storyline', 'story', 'narrative', 'events', 'story line', 'story', 'diegesis']
Percent of entries containing theme: 0.833

Theme words:  ['humor', 'humorous', 'funny', 'comedy', 'laugh', 'joke', 'goofy']
Percent of entries containing theme: 0.626



Discussion of plot transcends film quality. However, attempts at humor may be ill-received.

###  Sentiment analysis

In [27]:
for theme in themes:
    sentiment_of_theme(df, 'review_text', theme)
    print()

Theme words:  ['plot', 'storyline', 'story', 'narrative', 'events', 'story line', 'story', 'diegesis']
	Percent of mentions that are positive: 0.102
	Percent of mentions that are negative: 0.055

Theme words:  ['humor', 'humorous', 'funny', 'comedy', 'laugh', 'joke', 'goofy']
	Percent of mentions that are positive: 0.207
	Percent of mentions that are negative: 0.026



While comedy occurs more in negative reviews, usage of this theme is typically positive at the highest level of aggregation. Do differences emerge when we filter?

In [28]:
for theme in themes:
    sentiment_of_theme(df.loc[df.rating=='pos'], 'review_text', theme)
    print()

Theme words:  ['plot', 'storyline', 'story', 'narrative', 'events', 'story line', 'story', 'diegesis']
	Percent of mentions that are positive: 0.143
	Percent of mentions that are negative: 0.045

Theme words:  ['humor', 'humorous', 'funny', 'comedy', 'laugh', 'joke', 'goofy']
	Percent of mentions that are positive: 0.300
	Percent of mentions that are negative: 0.019



In [29]:
for theme in themes:
    sentiment_of_theme(df.loc[df.rating=='neg'], 'review_text', theme)
    print()

Theme words:  ['plot', 'storyline', 'story', 'narrative', 'events', 'story line', 'story', 'diegesis']
	Percent of mentions that are positive: 0.058
	Percent of mentions that are negative: 0.066

Theme words:  ['humor', 'humorous', 'funny', 'comedy', 'laugh', 'joke', 'goofy']
	Percent of mentions that are positive: 0.132
	Percent of mentions that are negative: 0.031



Differences do, in fact, emerge. However, within negative reviews, a lot of mentions are still positive. This may be because we are using a built in sentiment analyzer and comedic themes are typically positive. Differneces across positive and negative review groups are insightful.

# Classification

Can we leverage the language used in review texts to identify positive and negative reviews?

In [30]:
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score

In [31]:
catch = term_freqs_pos.merge(term_freqs_neg, on = 'word', how = 'outer', suffixes = ['_pos', '_neg'])
catch['abs_diff'] = catch.apply(lambda x: np.abs(x[3] - x[6]), axis = 1)
catch[['word', 'abs_diff', 'percent_containing_pos', 'percent_containing_neg']].sort_values('abs_diff', ascending = False).head(15)

Unnamed: 0,word,abs_diff,percent_containing_pos,percent_containing_neg
108,bad,0.255,0.259,0.514
11,life,0.158,0.492,0.334
1367,worst,0.15,0.044,0.194
37,plot,0.142,0.375,0.517
7,also,0.137,0.604,0.467
144,script,0.129,0.209,0.338
28,great,0.125,0.411,0.286
21,best,0.124,0.489,0.365
32,world,0.122,0.363,0.241
1313,boring,0.121,0.048,0.169


In [32]:
word_features = catch.sort_values('abs_diff', ascending = False).head(100).word.to_list()

In [33]:
def review_features(review):
    tokens = nltk.word_tokenize(review)
    features = {}
    for word in word_features:
        features['contains_{}'.format(word)] = (word in tokens)
    return features

df['tokenized'] = df.review_text.apply(lambda x: nltk.word_tokenize(x))

temp = df.copy()

for word in word_features:
        temp['contains_{}'.format(word)] = temp.review_text.str.contains(word)

X = temp.iloc[:,5:]
y = temp['rating']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

model = GaussianNB()
model.fit(X_train, y_train)

GaussianNB(priors=None, var_smoothing=1e-09)

In [34]:
y_pred = model.predict(X_test)

confusion_matrix(y_test, y_pred)

accuracy_score(y_test, y_pred)

0.7893939393939394

Given a balanced data set, this not too shabby. Do other models with the same features do any better?

In [35]:
import statsmodels.api as sm
from statsmodels.discrete.discrete_model import Logit

temp = df.copy()
for word in word_features:
        temp['contains_{}'.format(word)] = temp.review_text.str.contains(word)

X = temp.iloc[:,5:]
X = sm.add_constant(X)
y = (temp['rating']=='pos')

y = np.array(y, dtype=float)
X = np.array(X, dtype=float)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

lr = Logit(y_train, X_train).fit()

Optimization terminated successfully.
         Current function value: 0.317750
         Iterations 8


In [36]:
y_pred = lr.predict(X_test) > .5

confusion_matrix(y_test, y_pred)

accuracy_score(y_test, y_pred)

0.8196969696969697

Logistic regression edges out Naiive Bayes! We also get to see how individual terms influence outcomes.

In [37]:
X = temp.iloc[:,5:]
X = sm.add_constant(X)
res_df=pd.DataFrame({'odds_ratio':(np.exp(lr.params).T).tolist(),'variable':X.columns.tolist()})
res_df=res_df.sort_values('odds_ratio', ascending=False)

pd.options.display.max_rows = 63
res_df.head(15)

Unnamed: 0,odds_ratio,variable
48,5.114084,contains_memorable
78,3.037992,contains_solid
47,3.013555,contains_excellent
50,2.999779,contains_hilarious
66,2.778813,contains_subtle
73,2.697125,contains_sometimes
49,2.523307,contains_perfectly
40,2.452022,contains_wonderful
28,2.392911,contains_quite
69,2.258487,contains_brilliant


In [38]:
res_df.tail(15)

Unnamed: 0,odds_ratio,variable
63,0.447695,contains_attempt
70,0.425336,contains_terrible
91,0.417692,contains_poorly
77,0.392694,contains_poor
60,0.390138,contains_dull
3,0.382662,contains_worst
12,0.37566,contains_nothing
41,0.364639,contains_wasted
20,0.324643,contains_unfortunately
18,0.303092,contains_supposed


Directional language is most insightful. Nothing too wild.

In [39]:
# ['const'] + temp.iloc[:,5:].columns.to_list()

Lets train again, using all of the data.

In [40]:
temp = df.copy()
for word in word_features:
        temp['contains_{}'.format(word)] = temp.review_text.str.contains(word)

X = temp.iloc[:,5:]
X = sm.add_constant(X)
y = (temp['rating']=='pos')

y = np.array(y, dtype=float)
X = pd.DataFrame(np.array(X, dtype=float))
X.columns = ['const'] + temp.iloc[:,5:].columns.to_list()

lr = Logit(y, X).fit()
lr.summary()

Optimization terminated successfully.
         Current function value: 0.338146
         Iterations 8


0,1,2,3
Dep. Variable:,y,No. Observations:,2000.0
Model:,Logit,Df Residuals:,1899.0
Method:,MLE,Df Model:,100.0
Date:,"Sun, 24 Oct 2021",Pseudo R-squ.:,0.5122
Time:,10:57:32,Log-Likelihood:,-676.29
converged:,True,LL-Null:,-1386.3
Covariance Type:,nonrobust,LLR p-value:,4.058e-232

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
const,-0.6115,0.268,-2.282,0.022,-1.137,-0.086
contains_bad,-0.6624,0.154,-4.296,0.000,-0.965,-0.360
contains_life,0.1157,0.147,0.786,0.432,-0.173,0.404
contains_worst,-1.1886,0.276,-4.310,0.000,-1.729,-0.648
contains_plot,-0.5559,0.148,-3.764,0.000,-0.845,-0.266
contains_also,0.5085,0.147,3.455,0.001,0.220,0.797
contains_script,-0.4893,0.159,-3.068,0.002,-0.802,-0.177
contains_great,0.4952,0.153,3.236,0.001,0.195,0.795
contains_best,0.2739,0.149,1.844,0.065,-0.017,0.565


# Category sentence level cooccurance

In [41]:
df['review_sents'] = df['review_text'].apply(lambda x: nltk.sent_tokenize(x))

In [42]:
def count_sents_with_theme(x, theme):
    ct = 0
    for sent in x:
        if contains_theme(sent, theme):
            ct += 1
    return ct

In [43]:
def count_sents_with_theme_condition(x, theme_condition_on, theme_to_count):
    ct = 0
    for sent in x:
        if (contains_theme(sent, theme_condition_on)) & (contains_theme(sent, theme_to_count)):
            ct += 1
    return ct

In [44]:
def theme_cooccur(df, text_col, theme_list):
    df['sent_col'] = df[text_col].apply(lambda x: nltk.sent_tokenize(x))
    df['sent_ct'] = df.sent_col.apply(lambda x: len(x))
    for theme in theme_list:
        df['{0}_sent_ct'.format(theme)] = df['sent_col'].apply(lambda x: count_sents_with_theme(x, theme))
                                                                                                
    for theme in theme_list:
            print('\n')
            print('Prob of {0}: '.format(theme), df['{0}_sent_ct'.format(theme)].sum() / df['sent_ct'].sum())
            for theme_2 in theme_list:
                if theme != theme_2:
                    df['{0}_{1}_sent_ct'.format(theme, theme_2)] = df['sent_col'].\
                        apply(lambda x: count_sents_with_theme_condition(x, theme, theme_2))
                    print('Prob of {0} given {1}:'.format(theme, theme_2),\
                      df['{0}_{1}_sent_ct'.format(theme, theme_2)].sum() / df['{0}_sent_ct'.format(theme_2)].sum())
                    
                    nobs = [df['{0}_sent_ct'.format(theme_2)].sum(), df['sent_ct'].sum()]
                    ct =  [df['{0}_{1}_sent_ct'.format(theme, theme_2)].sum(), df['{0}_sent_ct'.format(theme)].sum()]

In [45]:
theme_cooccur(df, 'review_text', [comedy_theme, ['bad', 'horrible', 'terrible', 'poor']])



Prob of ['humor', 'humorous', 'funny', 'comedy', 'laugh', 'joke', 'goofy']:  0.044902980484258795
Prob of ['humor', 'humorous', 'funny', 'comedy', 'laugh', 'joke', 'goofy'] given ['bad', 'horrible', 'terrible', 'poor']: 0.0683453237410072


Prob of ['bad', 'horrible', 'terrible', 'poor']:  0.027204607728009842
Prob of ['bad', 'horrible', 'terrible', 'poor'] given ['humor', 'humorous', 'funny', 'comedy', 'laugh', 'joke', 'goofy']: 0.04140722291407223


Mentions of comedy increase mentions of negative directional language and vice versa. Maybe a genre to avoid if you're in the business of making good films.

In [46]:
theme_cooccur(df, 'review_text', [['biography', 'life', 'biopic'], ['good', 'great', 'outstanding', 'excellent']])



Prob of ['biography', 'life', 'biopic']:  0.023863445730582116
Prob of ['biography', 'life', 'biopic'] given ['good', 'great', 'outstanding', 'excellent']: 0.02159880834160874


Prob of ['good', 'great', 'outstanding', 'excellent']:  0.05631046245037186
Prob of ['good', 'great', 'outstanding', 'excellent'] given ['biography', 'life', 'biopic']: 0.050966608084358524


Mentions of biopics do not increase usage of positive directional langauage.