## Use Decision Tree to find important features

In [62]:
import os
import settings
import pandas as pd
import operator

from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error

def read_data():
    """Load the LIWC-segmented dataset from the processed-data directory.

    Reads ``all_with_liwc_segmented.xls`` located under
    ``settings.PROCESSED_DIR`` using ``pandas.read_excel`` with its default
    options.

    Returns:
        pandas.DataFrame: The parsed workbook contents.
    """
    path = os.path.join(settings.PROCESSED_DIR, "all_with_liwc_segmented.xls")
    # No ``encoding`` argument here: ``read_excel`` does not accept one in
    # current pandas (passing it raises TypeError), and text decoding of an
    # Excel workbook is handled by the Excel engine itself.
    return pd.read_excel(path)

def create_test_set(df):
    """Split ``df`` into train/test features and targets.

    Every column of ``df`` that is not listed in ``settings.NON_PREDICTORS``
    is used as a feature; ``settings.TARGET`` selects the target. The split
    uses a fixed ``random_state`` of 42.

    Args:
        df: DataFrame holding both the feature columns and the target.

    Returns:
        tuple: ``(X_train, X_test, y_train, y_test)``.
    """
    excluded = settings.NON_PREDICTORS
    feature_names = [col for col in df.columns.tolist() if col not in excluded]
    features = df[feature_names]
    target = df[settings.TARGET]
    train_X, test_X, train_y, test_y = train_test_split(
        features, target, random_state=42
    )
    return train_X, test_X, train_y, test_y


def compute_error(target, predictions):
    """Return the mean squared error of ``predictions`` against ``target``.

    Args:
        target: Ground-truth values.
        predictions: Predicted values, aligned with ``target``.

    Returns:
        The value computed by ``sklearn.metrics.mean_squared_error``.
    """
    mse = mean_squared_error(y_true=target, y_pred=predictions)
    return mse

def sort_important_features(df, random_state=None):
    """Fit a decision tree on ``df`` and print its features ranked by importance.

    Every column not listed in ``settings.NON_PREDICTORS`` is used as a
    feature and ``settings.TARGET`` as the target. Two lines are printed:
    the ``(feature, importance)`` pairs sorted from most to least important,
    and the fitted model's score.

    Args:
        df: DataFrame containing the feature columns and the target.
        random_state: Optional seed forwarded to ``DecisionTreeRegressor``.
            The default ``None`` keeps the estimator's own default, so
            repeated calls may rank features differently; pass an int to get
            a reproducible ranking.

    Returns:
        None. Results are printed only.
    """
    dt = DecisionTreeRegressor(random_state=random_state)
    predictors = [p for p in df.columns.tolist() if p not in settings.NON_PREDICTORS]
    features = df[predictors]
    target = df[settings.TARGET]
    dt.fit(features, target)

    results = dict(zip(predictors, dt.feature_importances_))
    sorted_results = sorted(results.items(), key=operator.itemgetter(1), reverse=True)
    print(sorted_results)

    # NOTE: for a regressor ``score`` is the R^2 coefficient, and it is
    # evaluated here on the same rows the tree was fitted on, so it is a
    # training-set fit measure rather than a held-out accuracy. The printed
    # label is kept unchanged for output compatibility.
    accuracy = dt.score(features, target)
    print("Accuracy: {}".format(accuracy))

In [63]:
# Load the dataset once; the cells below operate on this DataFrame.
df = read_data()


In [60]:
# Copy the DataFrame's column labels into a plain list for inspection.
my_columns = list(df.columns)

In [61]:
my_columns

['index',
 'comments',
 'description',
 'duration',
 'event',
 'film_date',
 'languages',
 'main_speaker',
 'name',
 'published_date',
 'ratings',
 'related_talks',
 'speaker_occupation',
 'tags',
 'title',
 'url',
 'views',
 'music',
 'conversation',
 'transcript',
 'persuasive',
 'inspiring',
 'unconvincing',
 'applause',
 'laughter',
 'norm_persuasive',
 'norm_inspiring',
 'norm_unconvincing',
 'transcript_1sthalf',
 'transcript_2ndhalf',
 'transcript_1q',
 'transcript_2q',
 'transcript_3q',
 'transcript_4q',
 'WC',
 'Analytic',
 'Clout',
 'Authentic',
 'Tone',
 'WPS',
 'Sixltr',
 'Dic',
 'function',
 'pronoun',
 'ppron',
 'i',
 'we',
 'you',
 'shehe',
 'they',
 'ipron',
 'article',
 'prep',
 'auxverb',
 'adverb',
 'conj',
 'negate',
 'verb',
 'adj',
 'compare',
 'interrog',
 'number',
 'quant',
 'affect',
 'posemo',
 'negemo',
 'anx',
 'anger',
 'sad',
 'social',
 'family',
 'friend',
 'female',
 'male',
 'cogproc',
 'insight',
 'cause',
 'discrep',
 'tentat',
 'certain',
 'differ'

In [66]:
sort_important_features(df)
# Disabled error report, kept for reference.
# NOTE(review): `train` and `predictions` are not defined in the cells shown
# here -- confirm where they come from before re-enabling.
#error = compute_error(train[settings.TARGET], predictions)
#print("Mean

[('affect_3q', 0.24767939778357462), ('affect_2h', 0.051902914052529135), ('sad_2h', 0.044861395334479165), ('sad_2q', 0.03727171256467008), ('negemo_change_q', 0.030630402084302517), ('sad_1q', 0.029542181393185787), ('posemo_2q', 0.025064003201156357), ('posemo_1q', 0.023924123804876216), ('affect_2q', 0.023732582318629351), ('sad_4q', 0.022275102745884835), ('affect_change_q', 0.021477004929404089), ('negemo_2h', 0.02061250198037759), ('sad_3q', 0.020399253406737018), ('negemo_1q', 0.019655457068667823), ('anx_4q', 0.018955931154548137), ('anx_2q', 0.018561361798340507), ('negemo_change_h', 0.018446081935476144), ('posemo_change_q', 0.018402784150519415), ('affect', 0.016271679653991591), ('posemo_4q', 0.016149166684076822), ('anger_3q', 0.015261987217762523), ('anger_1q', 0.015207669055086828), ('anger_4q', 0.014981820276176689), ('negemo_1h', 0.014554626398596502), ('negemo', 0.014028244006859892), ('posemo_3q', 0.013383221224504827), ('posemo_change_h', 0.012935076107245917), ('a

In [41]:
sort_important_features(df)

[('pronoun', 0.22634634818911475), ('i', 0.068672997213791956), ('certain', 0.04325495587188042), ('Moral', 0.02852101570177205), ('power', 0.026907815745320033), ('social', 0.021986171176665004), ('family', 0.021631870503589234), ('anx', 0.020971902004808682), ('home', 0.020004964191056795), ('hear', 0.018972262637917018), ('achieve', 0.018557730950453662), ('see', 0.017184614080643989), ('sad', 0.017146105299756093), ('drives', 0.015683595205564074), ('Clout', 0.01522869320354435), ('negate', 0.013772219234653674), ('Sixltr', 0.013498405954291109), ('relig', 0.013291901736845788), ('adverb', 0.012478690056831507), ('leisure', 0.011919358602055653), ('posemo_change_q', 0.011717287831847004), ('money', 0.011535944710464761), ('motion', 0.010496123675528745), ('feel', 0.010416012988235201), ('Tone', 0.010081712008414194), ('we', 0.009151049871837031), ('negemo_change_q', 0.0089284562145158317), ('Authentic', 0.008653322125345822), ('article', 0.0086010031855257383), ('ingest', 0.0074355

In [65]:
sort_important_features(df)

[('affect_3q', 0.24534460235071412), ('affect_2h', 0.050387179185868228), ('sad_2q', 0.036108112624338898), ('anger_3q', 0.033267376326998208), ('negemo_change_q', 0.031550460352664962), ('sad_1q', 0.030250504983562299), ('posemo_2q', 0.027168361242812057), ('posemo_2h', 0.026845230817509072), ('sad_2h', 0.024395694165356484), ('affect_2q', 0.023739258627465894), ('negemo_2h', 0.022492286185180264), ('posemo_1q', 0.021903127266779648), ('affect_change_q', 0.02156833016393115), ('sad_3q', 0.019757853766344879), ('anx_2q', 0.019583959628207286), ('affect_change_h', 0.018478522382868547), ('posemo_change_q', 0.018411039665945036), ('negemo_change_h', 0.018278221626145782), ('negemo_1q', 0.018206046108368255), ('posemo_4q', 0.01739886517783485), ('anx_3q', 0.015582263208831057), ('negemo_1h', 0.015514055170166217), ('affect', 0.015377730738010459), ('anger_1q', 0.014422191046300908), ('posemo_3q', 0.014297530045065824), ('negemo', 0.014043694966321733), ('anx_1h', 0.01401191223704254), ('a

In [42]:
sort_important_features(df)

[('pronoun', 0.22802894834711945), ('i', 0.06876144905141171), ('female', 0.034920432870458422), ('achieve', 0.023854647146257485), ('power', 0.022889939679956596), ('social', 0.021835610085189618), ('family', 0.021611288921928512), ('bio', 0.020648078991488145), ('home', 0.020186806103548796), ('anx', 0.019828633696526456), ('hear', 0.017916222713103398), ('sad', 0.017150666081241994), ('see', 0.016998744046479011), ('drives', 0.01659613141468565), ('Sixltr', 0.013978268127291202), ('negate', 0.013160251346345119), ('posemo', 0.012958040291428706), ('adverb', 0.012891148696320175), ('relig', 0.012794886558833374), ('leisure', 0.011322657611379511), ('motion', 0.011050684800032852), ('certain', 0.010823116770160174), ('money', 0.010421546958164307), ('we', 0.010183277298246975), ('Moral', 0.01017760254872759), ('interrog', 0.0096411957225414607), ('Tone', 0.0094394011306123717), ('article', 0.0090674393634778278), ('feel', 0.0090461070112251643), ('Clout', 0.0088327742766426583), ('Aut

## Random Forest

In [47]:
import os
import settings
import pandas as pd
import operator

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

In [67]:
def sort_important_features(df, random_state=None):
    """Fit a random forest on ``df`` and print its features ranked by importance.

    Every column not listed in ``settings.NON_PREDICTORS`` is used as a
    feature and ``settings.TARGET`` as the target. Two lines are printed:
    the ``(feature, importance)`` pairs sorted from most to least important,
    and the fitted model's score.

    Args:
        df: DataFrame containing the feature columns and the target.
        random_state: Optional seed forwarded to ``RandomForestRegressor``.
            The default ``None`` keeps the estimator's own default, so
            repeated calls may rank features differently; pass an int to get
            a reproducible ranking.

    Returns:
        None. Results are printed only.
    """
    rf = RandomForestRegressor(random_state=random_state)
    predictors = [p for p in df.columns.tolist() if p not in settings.NON_PREDICTORS]
    features = df[predictors]
    target = df[settings.TARGET]
    rf.fit(features, target)

    results = dict(zip(predictors, rf.feature_importances_))
    sorted_results = sorted(results.items(), key=operator.itemgetter(1), reverse=True)
    print(sorted_results)

    # NOTE: for a regressor ``score`` is the R^2 coefficient, and it is
    # evaluated here on the same rows the forest was fitted on, so it is a
    # training-set fit measure rather than a held-out accuracy. The printed
    # label is kept unchanged for output compatibility.
    accuracy = rf.score(features, target)
    print("Accuracy: {}".format(accuracy))

In [73]:
sort_important_features(df)

[('ppron', 0.077284363968883357), ('sad_2q', 0.052759705845388208), ('sad_2h', 0.031062316393476098), ('power', 0.02727934116591433), ('relig', 0.025291346273700988), ('ipron', 0.025039169338970463), ('PurityVice', 0.024633451182361307), ('assent', 0.023920856884461994), ('i', 0.021375905112341609), ('achieve', 0.01759656862811541), ('focuspresent', 0.015644700196431119), ('pronoun', 0.0147020655297922), ('anger', 0.014419926539354644), ('discrep', 0.013934077805753542), ('certain', 0.013771491229118586), ('family', 0.013134186093759364), ('time', 0.011986549819546428), ('IngroupVirtue', 0.011912079047930698), ('money', 0.011380712409069724), ('male', 0.011323458687574634), ('work', 0.011267221335483028), ('HarmVirtue', 0.0110345702025125), ('focuspast', 0.01075353182095252), ('shehe', 0.010153923231439221), ('hear', 0.0098505821947659205), ('informal', 0.0097814092050460856), ('verb', 0.0095295932444915878), ('drives', 0.0094854189085428042), ('article', 0.0089216135586015852), ('prep

In [74]:
sort_important_features(df)

[('ppron', 0.087258990935289438), ('relativ', 0.031918633550889447), ('pronoun', 0.029326161028134529), ('focusfuture', 0.024155910758204585), ('body', 0.023097820769066725), ('relig', 0.021673372013243635), ('space', 0.020098837169368826), ('achieve', 0.019739268402759587), ('i', 0.019409001906020518), ('ipron', 0.018057223980820363), ('sad_2q', 0.017385323134079905), ('power', 0.017230255129271309), ('home', 0.016361483630985569), ('work', 0.015517345127553175), ('focuspast', 0.015483197421022507), ('family', 0.015324587734766392), ('quant', 0.014453449284125309), ('adverb', 0.013957537926590055), ('sad_1h', 0.012304675409291454), ('time', 0.012174149541101451), ('posemo_1q', 0.011459489229618036), ('health', 0.010961519069523746), ('negate', 0.010515954252460918), ('discrep', 0.01045996105344343), ('female', 0.010247759934705248), ('money', 0.0099096639703008717), ('affect_2q', 0.0092154211106461291), ('shehe', 0.0091343441945342489), ('friend', 0.0091032796476540542), ('see', 0.008

## Linear regression with Sentiment

In [53]:
import statsmodels.api as sm
from sklearn.linear_model import LinearRegression

  from pandas.core import datetools


In [58]:
def create_summary(df, predictors=None):
    """Print a statsmodels OLS summary of ``norm_inspiring`` on the predictors.

    An intercept column is added to the design matrix before fitting.

    Args:
        df: DataFrame containing the predictor columns and a
            ``norm_inspiring`` column.
        predictors: Optional list of column names to regress on. Defaults to
            ``['affect_change_h']``.

    Returns:
        None. The regression summary is printed.
    """
    if predictors is None:
        predictors = ['affect_change_h']

    # The earlier scikit-learn LinearRegression fit was never read, and the
    # full column list was overwritten immediately; both were dead code.
    X = df[predictors]
    X2 = sm.add_constant(X)
    est = sm.OLS(df['norm_inspiring'], X2)
    est2 = est.fit()
    print(est2.summary())

In [59]:
create_summary(df)

                            OLS Regression Results                            
Dep. Variable:         norm_inspiring   R-squared:                       0.002
Model:                            OLS   Adj. R-squared:                  0.002
Method:                 Least Squares   F-statistic:                     4.808
Date:                Thu, 11 Jan 2018   Prob (F-statistic):             0.0284
Time:                        15:34:16   Log-Likelihood:                -16925.
No. Observations:                2406   AIC:                         3.385e+04
Df Residuals:                    2404   BIC:                         3.387e+04
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
                      coef    std err          t      P>|t|      [0.025      0.975]
-----------------------------------------------------------------------------------
const             280.1395      5.939     

In [165]:
def create_summary(df, predictors=None):
    """Print a statsmodels OLS summary of ``norm_inspiring`` on the predictors.

    An intercept column is added to the design matrix before fitting.

    Args:
        df: DataFrame containing the predictor columns and a
            ``norm_inspiring`` column.
        predictors: Optional list of column names to regress on. Defaults to
            the ten features hand-picked in this notebook. Repeated names are
            dropped, keeping the first occurrence.

    Returns:
        None. The regression summary is printed.
    """
    if predictors is None:
        predictors = ['we', 'sad_2q', 'power', 'relig', 'i', 'social',
                      'focuspast', 'achieve', 'home', 'see']

    # A repeated name would select the same column twice, producing perfectly
    # collinear regressors (the original list repeated 'relig' and 'power').
    # De-duplicate while preserving order so each feature enters once.
    predictors = list(dict.fromkeys(predictors))

    # The earlier scikit-learn LinearRegression fit was never read, and the
    # full column list was overwritten immediately; both were dead code.
    X = df[predictors]
    X2 = sm.add_constant(X)
    est = sm.OLS(df['norm_inspiring'], X2)
    est2 = est.fit()
    print(est2.summary())

In [166]:
create_summary(df)

                            OLS Regression Results                            
Dep. Variable:         norm_inspiring   R-squared:                       0.126
Model:                            OLS   Adj. R-squared:                  0.123
Method:                 Least Squares   F-statistic:                     34.59
Date:                Thu, 11 Jan 2018   Prob (F-statistic):           1.57e-63
Time:                        17:15:38   Log-Likelihood:                -16766.
No. Observations:                2406   AIC:                         3.355e+04
Df Residuals:                    2395   BIC:                         3.362e+04
Df Model:                          10                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const        -62.2171     30.206     -2.060      0.0