In [90]:
import operator
import os

import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor

import settings

In [95]:
def read_data():
    """Load the segmented LIWC workbook from ``settings.PROCESSED_DIR``.

    Returns:
        pandas.DataFrame: ``all_with_liwc_segmented.xls`` as parsed by
        ``pd.read_excel`` with default options.
    """
    # No ``encoding`` argument: ``read_excel`` does not accept one in current
    # pandas (it raises TypeError), and it had no effect on .xls parsing.
    workbook_path = os.path.join(settings.PROCESSED_DIR, "all_with_liwc_segmented.xls")
    return pd.read_excel(workbook_path)

def create_test_set(df):
    """Split ``df`` into train/test predictors and targets.

    Predictors are every column of ``df`` not listed in
    ``settings.NON_PREDICTORS``; the target is ``df[settings.TARGET]``.
    The split uses ``train_test_split`` defaults with ``random_state=42``.

    Returns:
        tuple: ``(X_train, X_test, y_train, y_test)``.
    """
    excluded = settings.NON_PREDICTORS
    feature_cols = [col for col in df.columns.tolist() if col not in excluded]
    train_X, test_X, train_y, test_y = train_test_split(
        df[feature_cols],
        df[settings.TARGET],
        random_state=42,
    )
    return train_X, test_X, train_y, test_y


def compute_error(target, predictions):
    """Return the mean squared error between ``target`` and ``predictions``."""
    mse = mean_squared_error(target, predictions)
    return mse

def sort_important_features(df, random_state=None):
    """Fit a decision tree on all predictors and print ranked importances.

    Predictors are every column of ``df`` not listed in
    ``settings.NON_PREDICTORS``; the target is ``df[settings.TARGET]``.

    Args:
        df: DataFrame holding the predictor columns and the target column.
        random_state: Optional seed forwarded to ``DecisionTreeRegressor``.
            The default ``None`` leaves the tree unseeded, so the ranking can
            change from run to run; pass an int for reproducible output.

    Prints:
        The ``(feature, importance)`` pairs sorted by descending importance,
        followed by an ``Accuracy: ...`` line.
    """
    predictors = [p for p in df.columns.tolist() if p not in settings.NON_PREDICTORS]
    X, y = df[predictors], df[settings.TARGET]

    dt = DecisionTreeRegressor(random_state=random_state)
    dt.fit(X, y)
    # No predict() call is needed: importances and score come from the fitted model.

    results = dict(zip(predictors, dt.feature_importances_))
    sorted_results = sorted(results.items(), key=operator.itemgetter(1), reverse=True)
    print(sorted_results)

    # NOTE: for a regressor ``score`` is R^2, and it is evaluated here on the
    # same rows used for fitting, so it is not a held-out accuracy.
    accuracy = dt.score(X, y)
    print("Accuracy: {}".format(accuracy))

In [96]:
df = read_data()

In [97]:
X_train, X_test, y_train, y_test = create_test_set(df)

In [98]:
sort_important_features(df)

[('risk', 0.073474394579168809), ('published_date', 0.07062221581144798), ('verb', 0.057852021874306638), ('posemo_change_q', 0.055980878170922212), ('filler', 0.046319516573587849), ('home', 0.040213357263335872), ('negate', 0.031849880540929271), ('negemo_4q', 0.02990427465167907), ('i', 0.029806043261614189), ('number', 0.029044097419533836), ('money', 0.028722939110058087), ('power', 0.026134150109493423), ('Authority', 0.025466443796991374), ('leisure', 0.024825651579089326), ('posemo_change_h', 0.020890899682812179), ('discrep', 0.019698670120846262), ('Moral', 0.018377141484917627), ('WC', 0.016495735395851804), ('ipron', 0.012843144097010682), ('social', 0.010523253868492842), ('achieve', 0.010285682703460147), ('quant', 0.010016548683207), ('drives', 0.0077525218317151638), ('interrog', 0.0077408316796077018), ('differ', 0.0076044250240186947), ('negemo_change_h', 0.0071831326961071147), ('percept', 0.0070179116308824886), ('article', 0.0069967984628870267), ('compare', 0.0069

In [99]:
sort_important_features(df)

[('risk', 0.072793906593524541), ('published_date', 0.070839523627815856), ('verb', 0.057947385255419849), ('posemo_change_q', 0.056059451349545882), ('you', 0.047660097335756581), ('home', 0.04003415710137697), ('negate', 0.032119816539563459), ('i', 0.030029714845729069), ('negemo_4q', 0.029722482141877282), ('money', 0.029599816681203962), ('number', 0.028653119923373372), ('power', 0.025963800198250647), ('Authority', 0.025061142378878333), ('leisure', 0.025046680540016863), ('posemo_change_h', 0.021035324771943906), ('cause', 0.019531754735901012), ('Moral', 0.018625837570546432), ('adverb', 0.016093250342250509), ('WC', 0.015443459568411547), ('ipron', 0.01118861718746871), ('achieve', 0.010538884573238164), ('differ', 0.0094079578088586413), ('quant', 0.009066600245634231), ('drives', 0.0087964797393525421), ('discrep', 0.0079689240074802235), ('we', 0.0077759544830932371), ('interrog', 0.0077672552498139419), ('negemo_change_h', 0.0077131515862566887), ('work', 0.00760652361322

In [100]:
sort_important_features(df)

[('risk', 0.072361793382002548), ('published_date', 0.071927458675133116), ('Authority', 0.070939311234900312), ('verb', 0.058402112432591947), ('posemo_change_q', 0.056185580636087369), ('home', 0.040206142977920772), ('negate', 0.03470791272815725), ('money', 0.032940992212498674), ('negemo_4q', 0.029632262234803197), ('i', 0.029416334881278865), ('number', 0.029269940276709327), ('power', 0.026349845222030172), ('leisure', 0.024719287701861836), ('posemo_change_h', 0.021138568451401291), ('discrep', 0.019927278339401926), ('Moral', 0.018199246857742251), ('WC', 0.015673545399200003), ('cause', 0.015086613582684131), ('ipron', 0.011567452321371186), ('achieve', 0.01025755975149339), ('negemo_change_h', 0.0099692467607989717), ('quant', 0.0090851729544308771), ('drives', 0.0085574463057390675), ('WPS', 0.0081102140220728081), ('interrog', 0.0080669893501295713), ('differ', 0.0076688563425492103), ('article', 0.0076300847843808377), ('percept', 0.007389689348415625), ('anger_1q', 0.007

1
('risk', 0.073474394579168809),
Cautionary tale

('published_date', 0.07062221581144798),

('verb', 0.057852021874306638),
As in good writing, verbs matter

('posemo_change_q', 0.055980878170922212),
Change in positive emotion

('filler', 0.046319516573587849),
Um, hey, uh

('home', 0.040213357263335872), 
('negate', 0.031849880540929271),
('negemo_4q', 0.02990427465167907), 
('i', 0.029806043261614189), 
('number', 0.029044097419533836),
('money', 0.028722939110058087), 
('power', 0.026134150109493423),
('Authority', 0.025466443796991374),
('leisure', 0.024825651579089326),
('posemo_change_h', 0.020890899682812179),
('discrep', 0.019698670120846262), 
('Moral', 0.018377141484917627)

2
('risk', 0.072793906593524541), 
('published_date', 0.070839523627815856), 
('verb', 0.057947385255419849), 
('posemo_change_q', 0.056059451349545882),
('you', 0.047660097335756581), 
('home', 0.04003415710137697), 
('negate', 0.032119816539563459), 
('i', 0.030029714845729069), 
('negemo_4q', 0.029722482141877282),
('money', 0.029599816681203962),
('number', 0.028653119923373372), 
('power', 0.025963800198250647), 
('Authority', 0.025061142378878333),
('leisure', 0.025046680540016863),
('posemo_change_h', 0.021035324771943906), 
('cause', 0.019531754735901012), 
('Moral', 0.018625837570546432)

3
('risk', 0.072361793382002548),
('published_date', 0.071927458675133116),
('Authority', 0.070939311234900312), 
('verb', 0.058402112432591947), 
('posemo_change_q', 0.056185580636087369), 
('home', 0.040206142977920772), 
('negate', 0.03470791272815725),
('money', 0.032940992212498674),
('negemo_4q', 0.029632262234803197),
('i', 0.029416334881278865), 
('number', 0.029269940276709327), 
('power', 0.026349845222030172),
('leisure', 0.024719287701861836),
('posemo_change_h', 0.021138568451401291),
('discrep', 0.019927278339401926),
('Moral', 0.018199246857742251), 
('WC', 0.015673545399200003),


In [101]:
low = [('tentat', 0.0053865767580026976), ('sad_4q', 0.0053278385580807757), ('hear', 0.005211701580574349), ('affiliation', 0.0051260025751416896), ('swear', 0.0049985779409516843), ('anger_1q', 0.0049425140303599067), ('HarmVirtue', 0.004891816130978572), ('we', 0.004799190509483419), ('friend', 0.0047528634597467211), ('Sixltr', 0.0046954280936836647), ('netspeak', 0.0045467835228079553), ('health', 0.0044945311364598109), ('Dic', 0.0043686025594007671), ('anx_1q', 0.0043680365578748969), ('they', 0.0040827338018428047), ('body', 0.0040062435895162184), ('conj', 0.0039725621486206867), ('IngroupVirtue', 0.0038735681346123269), ('posemo_1q', 0.0036040186383375921), ('negemo_change_q', 0.0035786264182702801), ('prep', 0.0035141763505594828), ('informal', 0.0034708519548186565), ('see', 0.0033999082561921145), ('Analytic', 0.0033826522770605042), ('affect_change_h', 0.0032868094432886401), ('WPS', 0.0032598425908740639), ('you', 0.0031793752904590218), ('space', 0.0030719319161258162), ('focusfuture', 0.002850372373633045), ('Tone', 0.0026449567583547752), ('AuthorityVice', 0.0026179207797620454), ('focuspresent', 0.0024225619898505879), ('family', 0.0020455732877323754), ('affect', 0.0020389686136616696), ('reward', 0.0020064864664260541), ('ppron', 0.0020052321478085944), ('relig', 0.0019900370103944119), ('Authentic', 0.001922950850473952), ('nonflu', 0.0017768220294380377), ('certain', 0.0017010770229605333), ('sexual', 0.0016851305243786555), ('auxverb', 0.0015609686429837624), ('sad', 0.0014430399821373447), ('male', 0.0013055206014603541), ('anx', 0.0012691436070299906), ('HarmVice', 0.0012151831372337618), ('Purity', 0.0011678456796393041), ('sad_1q', 0.0011029556404667142), ('Harm', 0.0011014290882006776), ('shehe', 0.0010171517008699582), ('motion', 0.0010140982206936701), ('female', 0.00088983000670947541), ('Clout', 0.00088857724166929925), ('AuthorityVirtue', 0.00088582004913548198), ('bio', 0.00079536065761323172), ('adj', 0.0006295195398074439), 
('feel', 0.00062027696814649506), ('anger', 0.0004724597253301095), ('PurityVice', 0.00043968251578002291), ('affect_change_q', 0.00043070468934247521), ('affect_4q', 0.0004053449943390199), ('IngroupVice', 0.00040396163570287374), ('affect_1q', 0.00036733803695353202), ('pronoun', 0.00034812691253552466), ('assent', 0.00032187646523085364), ('Fairness', 0.00022310852815522137), ('posemo_4q', 0.00020622672154547083), ('anger_4q', 0.00019769815826300806), ('PurityVirtue', 0.00010409136121980249), ('FairnessVice', 9.2224258996034231e-05), ('cogproc', 9.0269275338138017e-05), ('anx_4q', 7.0243952579134128e-05),('sad_4q', 0.0053942962777485462), ('HarmVirtue', 0.0051569263849859416), ('insight', 0.0050844034123471663), ('anger_1q', 0.0049295499196476923), ('Dic', 0.0042177809514961423), ('health', 0.0041919856999113317), ('Analytic', 0.0041124753436678416), ('affect', 0.0040204404103013599), ('IngroupVirtue', 0.0039257536929573414), ('posemo_1q', 0.0038980247349565305), ('body', 0.0038849898758612699), ('hear', 0.0038707643925611781), ('focusfuture', 0.0038390594859096733), ('friend', 0.0036960843758724075), ('affect_change_q', 0.003683143323166009), ('they', 0.0034332879874011358), ('WPS', 0.0033542209560350495), ('see', 0.0033443490242599798), ('informal', 0.0033207206417625087), ('negemo_change_q', 0.0032861303045450345), ('space', 0.0031326645280626278), ('affect_change_h', 0.0027307487069558743), ('Tone', 0.0026974460825549729), ('focuspresent', 0.0025299852802151972), ('swear', 0.0022165860123567727), ('ppron', 0.0021962378954014832), ('family', 0.0021557098484859373), ('sexual', 0.002107797520567192), ('nonflu', 0.0020346339816841434), ('AuthorityVice', 0.0019904850766532737), ('relig', 0.001978305629876621), ('certain', 0.0017994310021360392), ('Authentic', 0.0017994212508304086), ('reward', 0.0017556171928594462), ('tentat', 0.0017410085220342572), ('anger', 0.0016635504372487335), ('HarmVice', 0.0015110088527850543), ('male', 0.0014668285740785871), 
('anx_1q', 0.0014159775673697594), ('AuthorityVirtue', 0.0013253767861361627), ('anx', 0.0012626822669829538), ('social', 0.0011544228317707913), ('Purity', 0.0011518162264632864), ('anx_4q', 0.0011053013274686832), ('motion', 0.0010576414641757593), ('adj', 0.0008996467481051459), ('shehe', 0.0007998385198028301), ('Clout', 0.00078976602407741369), ('feel', 0.0007317208065645435), ('Harm', 0.00070059584445587289), ('female', 0.00065452381833226833), ('auxverb', 0.00057630622516550393), ('assent', 0.00053027937647192745), ('bio', 0.00052181079530130891), ('sad_1q', 0.00051387203865247148), ('affect_4q', 0.00043016489942892898), ('PurityVice', 0.00040745080007905694), ('sad', 0.00031906865753090103), ('pronoun', 0.00030652852378634029), ('filler', 0.00029811787299067089), ('anger_4q', 0.00018633338783772169), ('cogproc', 0.00016198778142792524), ('netspeak', 0.00014019743433018347), ('affect_1q', 0.00010859955981436151), ('Fairness', 9.9742982383001696e-05), ('posemo_4q', 6.8205640072489072e-05), ('FairnessVice', 5.2496956556197241e-05), ('PurityVirtue', 4.7896519111429485e-05), ('IngroupVice', 1.6071146964681048e-05), ('Sixltr', 0.0051852671549559443), ('HarmVirtue', 0.0050237528976672229), ('Dic', 0.0048846215410330782), ('insight', 0.004858274561247748), ('negemo_1q', 0.0046013748552720833), ('IngroupVirtue', 0.0045646835826702184), ('we', 0.0044574962512234839), ('friend', 0.0044017775200105346), ('health', 0.0041885025573648611), ('affect', 0.0040355044922239219), ('body', 0.0039581797603742558), ('conj', 0.0039523135921805319), ('they', 0.0035932771540076111), ('prep', 0.0034972235875222052), ('space', 0.0034358361879882619), ('see', 0.0033627768361834975), ('relativ', 0.0032988541901197302), ('informal', 0.0032865988676952342), ('Analytic', 0.0032266475041801052), ('focusfuture', 0.0029772347620111218), ('swear', 0.0028539802861784918), ('hear', 0.0027385214589133971), ('reward', 0.0026293281333465185), ('affect_change_h', 0.0025172759549220176), 
('negemo_change_q', 0.0023884161527477953), ('relig', 0.0023682926927271154), ('family', 0.0021447371529130965), ('Fairness', 0.0020355606306691359), ('Clout', 0.0018276087109331031), ('nonflu', 0.0018191866990768712), ('sexual', 0.0017945843618267878), ('you', 0.0016740400376939728), ('ppron', 0.0016410093885586166), ('Harm', 0.0016286481759070731), ('focuspresent', 0.0016161302021618828), ('sad', 0.0015221258789752297), ('anx_1q', 0.001478517605652519), ('AuthorityVirtue', 0.0014684161277431793), ('Tone', 0.0013411606693722585), ('AuthorityVice', 0.0013320167335257327), ('anx', 0.0012830662314739245), ('motion', 0.0012758288086499613), ('Authentic', 0.0012360024932374495), ('male', 0.0012150774462716103), ('HarmVice', 0.0011605643328505787), ('shehe', 0.0010506345374110019), ('anger_4q', 0.0010114509905799222), ('Purity', 0.00099180869946841966), ('social', 0.00098896713317120398), ('affect_change_q', 0.00097158999391127698), ('certain', 0.00080458093574328076), ('auxverb', 0.00067300021531426077), ('feel', 0.00066772243839919286), ('anger', 0.00065763432676771427), ('adj', 0.00058657778387491335), ('female', 0.00055108180854159168), ('sad_1q', 0.00047384991821953685), ('PurityVice', 0.0004174983440639313), ('bio', 0.00040806877017292233), ('cogproc', 0.00039400602167081095), ('affect_4q', 0.00037377661963421311), ('posemo_4q', 0.00032685274351924322), ('assent', 0.00030050191514169729), ('filler', 0.00024077494665044114), ('pronoun', 0.00022136135891147329), ('netspeak', 0.00021016576792164772), ('PurityVirtue', 0.00020828304289650576), ('affect_1q', 0.000128010799941917), ('anx_4q', 0.00012436309574417667), ('IngroupVice', 0.00012190608711989372), ('FairnessVice', 6.3244279997832941e-06)]

In [102]:
# Keep only the feature name from each (name, importance) pair in `low`.
names = [name for name, _ in low]
    

In [103]:
# Collapse repeated feature names into a set.
uniques = {name for name in names}

In [105]:
# Back to a list so it can be concatenated with another list.
unique_list = [*uniques]

In [107]:
# Extended exclusion list: the project-wide non-predictors plus every
# feature name collected in `unique_list`.
NON_PREDICTORS = settings.NON_PREDICTORS + unique_list

In [108]:
def sort_important_features(df, random_state=None):
    """Fit a random forest on the reduced predictor set and print ranked importances.

    Predictors are every column of ``df`` not listed in the module-level
    ``NON_PREDICTORS``; the target is ``df[settings.TARGET]``.

    Args:
        df: DataFrame holding the predictor columns and the target column.
        random_state: Optional seed forwarded to ``RandomForestRegressor``.
            The default ``None`` leaves the forest unseeded, so the ranking
            can change from run to run; pass an int for reproducible output.

    Prints:
        The ``(feature, importance)`` pairs sorted by descending importance,
        followed by an ``Accuracy: ...`` line.
    """
    predictors = [p for p in df.columns.tolist() if p not in NON_PREDICTORS]
    X, y = df[predictors], df[settings.TARGET]

    rf = RandomForestRegressor(random_state=random_state)
    rf.fit(X, y)
    # No predict() call is needed: importances and score come from the fitted model.

    results = dict(zip(predictors, rf.feature_importances_))
    sorted_results = sorted(results.items(), key=operator.itemgetter(1), reverse=True)
    print(sorted_results)

    # NOTE: for a regressor ``score`` is R^2, and it is evaluated here on the
    # same rows used for fitting, so it is not a held-out accuracy.
    accuracy = rf.score(X, y)
    print("Accuracy: {}".format(accuracy))

In [109]:
sort_important_features(df)

[('published_date', 0.11203220861268645), ('money', 0.074213209901510285), ('risk', 0.064857083784047143), ('WC', 0.04944127622686055), ('drives', 0.03911664672174435), ('negate', 0.034532181071393761), ('time', 0.033326196212202529), ('leisure', 0.033102116358872044), ('posemo_change_h', 0.031808357307276727), ('posemo_change_q', 0.029533055711365969), ('verb', 0.026294854885633906), ('focuspast', 0.025535987736219123), ('i', 0.025216120909875444), ('power', 0.023766018827018634), ('negemo', 0.023528333732162142), ('home', 0.023213099454567093), ('percept', 0.022679171780002151), ('discrep', 0.021167913006984718), ('posemo', 0.021089647748747633), ('negemo_4q', 0.020713015460712612), ('quant', 0.020502907299419556), ('interrog', 0.017939213348217832), ('death', 0.017677098415354205), ('number', 0.01757041771992781), ('Moral', 0.017178320574287354), ('achieve', 0.016968220537124067), ('compare', 0.016634649516838754), ('adverb', 0.015885640951287666), ('work', 0.015351947446712014), ('

In [110]:
sort_important_features(df)

[('risk', 0.077634259914055606), ('published_date', 0.077337826128884207), ('posemo_change_q', 0.056097346400067305), ('money', 0.051110506767606), ('home', 0.046923967252406565), ('WC', 0.046562546493915113), ('negemo', 0.046036370725130532), ('drives', 0.042380240885860135), ('negate', 0.037230050898768449), ('Moral', 0.030124158077218178), ('number', 0.029451516962778829), ('verb', 0.029385195609824943), ('leisure', 0.028078655606332596), ('i', 0.026129553626563613), ('negemo_change_h', 0.02449124096272241), ('posemo_change_h', 0.023834680530953421), ('Authority', 0.023640719253006759), ('adverb', 0.023218943191236741), ('ipron', 0.021765432654005306), ('focuspast', 0.020961894315907746), ('discrep', 0.01885435697110599), ('Ingroup', 0.016997448633311762), ('interrog', 0.016894708721482025), ('differ', 0.016879442161058737), ('quant', 0.016800473072541556), ('percept', 0.016520010526866139), ('article', 0.016514582938325115), ('power', 0.015967116302781879), ('work', 0.0155295115259

In [111]:
sort_important_features(df)

[('published_date', 0.1091380329150746), ('risk', 0.090865539100594056), ('money', 0.06699190660819751), ('WC', 0.066485135760677508), ('home', 0.043422976607010305), ('posemo_change_q', 0.04222486383499606), ('posemo_change_h', 0.035206958829704135), ('verb', 0.034266955846992103), ('i', 0.032802109762449287), ('power', 0.031285970847873996), ('negate', 0.02739452442574819), ('MoralityGeneral', 0.024292148784740113), ('time', 0.023242098121748912), ('Moral', 0.021814261684810966), ('interrog', 0.021638010312245385), ('quant', 0.02064923723098739), ('leisure', 0.019446438062703051), ('negemo_change_h', 0.018979420416473981), ('focuspast', 0.018953484418988583), ('cause', 0.0168005473249019), ('percept', 0.016603269555190342), ('Ingroup', 0.016347145969471049), ('negemo', 0.015981681571073766), ('compare', 0.014999815253589146), ('differ', 0.014389615183625137), ('drives', 0.014236264758371678), ('work', 0.01380580023681599), ('negemo_4q', 0.013649309983500507), ('discrep', 0.0131994812

And with all predictors (excluding only `settings.NON_PREDICTORS`)

In [112]:
def sort_important_features(df, random_state=None):
    """Fit a random forest on all predictors and print ranked importances.

    Predictors are every column of ``df`` not listed in
    ``settings.NON_PREDICTORS``; the target is ``df[settings.TARGET]``.

    Args:
        df: DataFrame holding the predictor columns and the target column.
        random_state: Optional seed forwarded to ``RandomForestRegressor``.
            The default ``None`` leaves the forest unseeded, so the ranking
            can change from run to run; pass an int for reproducible output.

    Prints:
        The ``(feature, importance)`` pairs sorted by descending importance,
        followed by an ``Accuracy: ...`` line.
    """
    predictors = [p for p in df.columns.tolist() if p not in settings.NON_PREDICTORS]
    X, y = df[predictors], df[settings.TARGET]

    rf = RandomForestRegressor(random_state=random_state)
    rf.fit(X, y)
    # No predict() call is needed: importances and score come from the fitted model.

    results = dict(zip(predictors, rf.feature_importances_))
    sorted_results = sorted(results.items(), key=operator.itemgetter(1), reverse=True)
    print(sorted_results)

    # NOTE: for a regressor ``score`` is R^2, and it is evaluated here on the
    # same rows used for fitting, so it is not a held-out accuracy.
    accuracy = rf.score(X, y)
    print("Accuracy: {}".format(accuracy))

In [113]:
sort_important_features(df)

[('published_date', 0.088191379658326532), ('risk', 0.066234165795880401), ('money', 0.04426266616087516), ('WC', 0.031231085392603376), ('AuthorityVice', 0.030663830604213432), ('i', 0.029593088317319455), ('auxverb', 0.022935953084053294), ('certain', 0.022275628729651768), ('posemo_change_q', 0.021389076029200073), ('HarmVice', 0.021106651147428285), ('you', 0.019876270982885677), ('conj', 0.019448753260165867), ('focuspresent', 0.017172163721731488), ('adj', 0.016934337173227041), ('home', 0.016540196809874078), ('anger_1q', 0.016309775499885326), ('death', 0.016061457904749343), ('drives', 0.014797463741036434), ('Moral', 0.012874167512487563), ('leisure', 0.012299653649454441), ('ppron', 0.012271326409360227), ('Authentic', 0.012055573416021925), ('IngroupVirtue', 0.01196465739554026), ('nonflu', 0.011023201378735699), ('affect', 0.010543004906543092), ('power', 0.010201469847598607), ('health', 0.010076359587997177), ('cause', 0.0097219011300885179), ('social', 0.009347418318713

In [114]:
sort_important_features(df)

[('published_date', 0.08897397895574459), ('risk', 0.058163786937642771), ('money', 0.05173456679833828), ('WC', 0.041008456827099096), ('posemo_change_q', 0.031834077896883292), ('tentat', 0.024933558592640244), ('home', 0.024165532141339603), ('Moral', 0.021783105908599797), ('auxverb', 0.021265589186367125), ('negate', 0.020758101235156812), ('negemo', 0.020530878561817425), ('i', 0.017593881169773614), ('drives', 0.01645371577735635), ('posemo_change_h', 0.015547692347081202), ('power', 0.015361305903208103), ('anger_4q', 0.015271666136264539), ('HarmVirtue', 0.014622995095198103), ('work', 0.014471043341475623), ('adverb', 0.013666154426711949), ('focuspresent', 0.012855813790084412), ('leisure', 0.012263773778852243), ('number', 0.011093012350847134), ('we', 0.010585021083407479), ('affiliation', 0.01012182786741412), ('focuspast', 0.0099671136365107518), ('see', 0.00969722136607685), ('discrep', 0.0093340241694394258), ('affect_1q', 0.0092963129618442632), ('Authentic', 0.009271

In [115]:
sort_important_features(df)

[('published_date', 0.096897209492415731), ('risk', 0.085051985454712375), ('money', 0.058933322414695358), ('WC', 0.029355303889833852), ('auxverb', 0.021103928813743313), ('AuthorityVice', 0.02073752057389075), ('we', 0.020641534656066308), ('drives', 0.020110900755774239), ('time', 0.019253713664896623), ('space', 0.018198022492345721), ('percept', 0.017616457583202107), ('certain', 0.016864497174630323), ('interrog', 0.016841291446110525), ('Dic', 0.01620916567349414), ('focuspast', 0.015340832653116521), ('negemo_change_h', 0.0152986782497057), ('Moral', 0.015072158415991604), ('leisure', 0.014729223293855075), ('prep', 0.014552202394179471), ('FairnessVirtue', 0.014039647854517986), ('AuthorityVirtue', 0.012794820865851184), ('ipron', 0.011652513975038234), ('posemo_change_q', 0.011551173900748099), ('negate', 0.011376052504934574), ('sexual', 0.011341934393999014), ('affiliation', 0.010464936135773104), ('relativ', 0.010132028744716562), ('see', 0.0098828193936841922), ('focuspr

In [116]:
my_columns= list(df.columns)

In [117]:
my_columns

['index',
 'comments',
 'description',
 'duration',
 'event',
 'film_date',
 'languages',
 'main_speaker',
 'name',
 'published_date',
 'ratings',
 'related_talks',
 'speaker_occupation',
 'tags',
 'title',
 'url',
 'views',
 'music',
 'conversation',
 'transcript',
 'persuasive',
 'inspiring',
 'unconvincing',
 'applause',
 'laughter',
 'norm_persuasive',
 'norm_inspiring',
 'norm_unconvincing',
 'transcript_1sthalf',
 'transcript_2ndhalf',
 'transcript_1q',
 'transcript_2q',
 'transcript_3q',
 'transcript_4q',
 'WC',
 'Analytic',
 'Clout',
 'Authentic',
 'Tone',
 'WPS',
 'Sixltr',
 'Dic',
 'function',
 'pronoun',
 'ppron',
 'i',
 'we',
 'you',
 'shehe',
 'they',
 'ipron',
 'article',
 'prep',
 'auxverb',
 'adverb',
 'conj',
 'negate',
 'verb',
 'adj',
 'compare',
 'interrog',
 'number',
 'quant',
 'affect',
 'posemo',
 'negemo',
 'anx',
 'anger',
 'sad',
 'social',
 'family',
 'friend',
 'female',
 'male',
 'cogproc',
 'insight',
 'cause',
 'discrep',
 'tentat',
 'certain',
 'differ'

In [118]:
# Explicit list of predictor column names. The later definitions of
# sort_important_features assign this list to `predictors` directly instead
# of filtering df.columns through settings.NON_PREDICTORS.
features = [
 'published_date',
 'laughter',
 'WC',
 'Analytic',
 'Clout',
 'Authentic',
 'Tone',
 'WPS',
 'Sixltr',
 'pronoun',
 'ppron',
 'i',
 'we',
 'you',
 'shehe',
 'they',
 'ipron',
 'article',
 'prep',
 'auxverb',
 'adverb',
 'conj',
 'negate',
 'verb',
 'adj',
 'compare',
 'interrog',
 'number',
 'quant',
 'affect',
 'posemo',
 'negemo',
 'anx',
 'anger',
 'sad',
 'social',
 'family',
 'friend',
 'female',
 'male',
 'cogproc',
 'insight',
 'cause',
 'discrep',
 'tentat',
 'certain',
 'differ',
 'percept',
 'see',
 'hear',
 'feel',
 'bio',
 'body',
 'health',
 'sexual',
 'ingest',
 'drives',
 'affiliation',
 'achieve',
 'power',
 'reward',
 'risk',
 'focuspast',
 'focuspresent',
 'focusfuture',
 'relativ',
 'motion',
 'space',
 'time',
 'work',
 'leisure',
 'home',
 'money',
 'relig',
 'death',
 'informal',
 'swear',
 'netspeak',
 'assent',
 'nonflu',
 'filler',
 'Moral',
 'HarmVirtue',
 'HarmVice',
 'FairnessVirtue',
 'FairnessVice',
 'IngroupVirtue',
 'IngroupVice',
 'AuthorityVirtue',
 'AuthorityVice',
 'PurityVirtue',
 'PurityVice',
 'MoralityGeneral',
 'affect_1h',
 'posemo_1h',
 'negemo_1h',
 'anx_1h',
 'anger_1h',
 'sad_1h',
 'affect_2h',
 'posemo_2h',
 'negemo_2h',
 'anx_2h',
 'anger_2h',
 'sad_2h',
 'affect_1q',
 'posemo_1q',
 'negemo_1q',
 'anx_1q',
 'anger_1q',
 'sad_1q',
 'affect_2q',
 'posemo_2q',
 'negemo_2q',
 'anx_2q',
 'anger_2q',
 'sad_2q',
 'affect_3q',
 'posemo_3q',
 'negemo_3q',
 'anx_3q',
 'anger_3q',
 'sad_3q',
 'affect_4q',
 'posemo_4q',
 'negemo_4q',
 'anx_4q',
 'anger_4q',
 'sad_4q',
 'posemo_change_h',
 'negemo_change_h',
 'affect_change_h',
 'posemo_change_q',
 'negemo_change_q',
 'affect_change_q',
 'Harm',
 'Fairness',
 'Purity',
 'Ingroup',
 'Authority']

In [119]:
def sort_important_features(df, random_state=None):
    """Fit a decision tree on the explicit ``features`` list and print ranked importances.

    Predictors are the module-level ``features`` column names; the target is
    ``df[settings.TARGET]``.

    Args:
        df: DataFrame holding the ``features`` columns and the target column.
        random_state: Optional seed forwarded to ``DecisionTreeRegressor``.
            The default ``None`` leaves the tree unseeded, so the ranking can
            change from run to run; pass an int for reproducible output.

    Prints:
        The ``(feature, importance)`` pairs sorted by descending importance,
        followed by an ``Accuracy: ...`` line.
    """
    predictors = features
    X, y = df[predictors], df[settings.TARGET]

    dt = DecisionTreeRegressor(random_state=random_state)
    dt.fit(X, y)
    # No predict() call is needed: importances and score come from the fitted model.

    results = dict(zip(predictors, dt.feature_importances_))
    sorted_results = sorted(results.items(), key=operator.itemgetter(1), reverse=True)
    print(sorted_results)

    # NOTE: for a regressor ``score`` is R^2, and it is evaluated here on the
    # same rows used for fitting, so it is not a held-out accuracy.
    accuracy = dt.score(X, y)
    print("Accuracy: {}".format(accuracy))

In [121]:
sort_important_features(df)

[('risk', 0.070676738183284557), ('published_date', 0.070433010650798655), ('verb', 0.058203358697896723), ('posemo_change_q', 0.055602007598142496), ('social', 0.047455549236832838), ('home', 0.045212064118177561), ('negate', 0.03352663097038025), ('i', 0.031702506131511453), ('negemo_4q', 0.029498474529107425), ('money', 0.028593395847810274), ('number', 0.028451896106123305), ('Authority', 0.025461976155273167), ('power', 0.025191708386506063), ('leisure', 0.023753818077370946), ('Moral', 0.022111367869772587), ('posemo_change_h', 0.020987930441157657), ('adverb', 0.016754414631991215), ('WC', 0.014150621721499751), ('differ', 0.013137375511168475), ('ipron', 0.010811936868587405), ('quant', 0.010659189419182275), ('achieve', 0.010170217047126966), ('compare', 0.0081231950316628249), ('article', 0.0079601491822722231), ('drives', 0.007815907976702275), ('interrog', 0.0076044031967175754), ('percept', 0.0074157474717979344), ('time', 0.0071159499850220577), ('focuspast', 0.0070856323

In [122]:
sort_important_features(df)

[('published_date', 0.071601660917657642), ('risk', 0.070659801691833946), ('verb', 0.057881818460199685), ('posemo_change_q', 0.055486299557189103), ('hear', 0.048299058105642792), ('home', 0.041340986942566091), ('negate', 0.032292488454287824), ('i', 0.031452624140202182), ('negemo_4q', 0.030463441394481264), ('money', 0.030140079427416082), ('number', 0.027460013648905521), ('Authority', 0.02533685614004098), ('power', 0.025228904207769246), ('leisure', 0.023586506892656775), ('Moral', 0.022208079047514512), ('posemo_change_h', 0.02105731855538941), ('discrep', 0.018745906870683733), ('WC', 0.013245745382046058), ('cause', 0.013111002935752942), ('ipron', 0.010862263497930354), ('quant', 0.010440383114149122), ('achieve', 0.0096142376542722435), ('drives', 0.0081298499511393753), ('interrog', 0.0081218342716447272), ('ingest', 0.0078980756777805514), ('negemo_change_h', 0.0077711618763364595), ('compare', 0.0076722348583342882), ('WPS', 0.0076585762792185071), ('percept', 0.0076246

In [123]:
sort_important_features(df)

[('i', 0.076936944284107744), ('published_date', 0.070817985723397009), ('risk', 0.070587197321564349), ('verb', 0.05845240786751621), ('posemo_change_q', 0.055573690358420816), ('home', 0.041229999601991814), ('negate', 0.033866906140230399), ('negemo_4q', 0.029404013201846428), ('money', 0.02914938854487361), ('number', 0.027467805708806924), ('Moral', 0.026546498125891608), ('Authority', 0.025561419415837349), ('power', 0.025048961132015674), ('leisure', 0.023607299702897155), ('posemo_change_h', 0.020659710275630783), ('MoralityGeneral', 0.01819497664396454), ('WC', 0.014141758914045711), ('percept', 0.010957845221665955), ('ipron', 0.010912824167260717), ('quant', 0.010662379975260188), ('social', 0.010627453491935352), ('achieve', 0.0096757619836402211), ('drives', 0.0092341480063549784), ('compare', 0.0077121917338680524), ('article', 0.0072513326978744788), ('death', 0.0072058314331804561), ('interrog', 0.0070463423623780093), ('focuspast', 0.0067489766590402931), ('time', 0.00

In [124]:
def sort_important_features(df, random_state=None):
    """Fit a random forest on the explicit ``features`` list and print ranked importances.

    Predictors are the module-level ``features`` column names; the target is
    ``df[settings.TARGET]``.

    Args:
        df: DataFrame holding the ``features`` columns and the target column.
        random_state: Optional seed forwarded to ``RandomForestRegressor``.
            The default ``None`` leaves the forest unseeded, so the ranking
            can change from run to run; pass an int for reproducible output.

    Prints:
        The ``(feature, importance)`` pairs sorted by descending importance,
        followed by an ``Accuracy: ...`` line.
    """
    predictors = features
    X, y = df[predictors], df[settings.TARGET]

    rf = RandomForestRegressor(random_state=random_state)
    rf.fit(X, y)
    # No predict() call is needed: importances and score come from the fitted model.

    results = dict(zip(predictors, rf.feature_importances_))
    sorted_results = sorted(results.items(), key=operator.itemgetter(1), reverse=True)
    print(sorted_results)

    # NOTE: for a regressor ``score`` is R^2, and it is evaluated here on the
    # same rows used for fitting, so it is not a held-out accuracy.
    accuracy = rf.score(X, y)
    print("Accuracy: {}".format(accuracy))

In [125]:
sort_important_features(df)

[('risk', 0.085437769574385702), ('published_date', 0.070916516666291965), ('WC', 0.053934131666730369), ('money', 0.050298697761546454), ('posemo_change_q', 0.036525147773929026), ('negemo_4q', 0.034009045741315136), ('drives', 0.020364700244653689), ('AuthorityVice', 0.016036851059080884), ('focuspast', 0.015849015352172764), ('negemo_change_q', 0.015384161898508253), ('power', 0.015293954151364451), ('cause', 0.015003615416295749), ('negate', 0.014138560882136639), ('auxverb', 0.013336106087244545), ('time', 0.013018327147912883), ('home', 0.012887179048657054), ('affect_change_q', 0.012810056335026701), ('see', 0.012581265108308936), ('Ingroup', 0.012215278562204859), ('PurityVirtue', 0.01133642207204762), ('affect_change_h', 0.011251910654044813), ('certain', 0.011204218816673448), ('we', 0.011045306487367453), ('Authentic', 0.010742682764340416), ('percept', 0.010649200090109116), ('negemo', 0.010392618097647787), ('Moral', 0.010312592314151516), ('death', 0.0090877474988220984),

In [126]:
sort_important_features(df)

[('published_date', 0.068193006555488156), ('risk', 0.048405304672448503), ('home', 0.047259258361842915), ('money', 0.034533832752430479), ('posemo_change_q', 0.032784061638277601), ('WC', 0.032592847351664754), ('drives', 0.026432675406137751), ('assent', 0.025945240848537242), ('i', 0.022406150063023288), ('percept', 0.020860914697789466), ('we', 0.018964221348516764), ('insight', 0.018940242599965296), ('Moral', 0.018932722537384018), ('prep', 0.01826536729522258), ('affect_1h', 0.016575180361498791), ('leisure', 0.01633776435934289), ('discrep', 0.015611312385254502), ('verb', 0.014871810942511703), ('affect_1q', 0.014478267155869907), ('pronoun', 0.013625636738948788), ('female', 0.013507656786236782), ('auxverb', 0.012708921903606623), ('social', 0.012330659597833939), ('Clout', 0.012229631802885414), ('anx_2h', 0.010521162717857085), ('focuspresent', 0.0098592911773328091), ('article', 0.009319654691147445), ('time', 0.0092067509477971581), ('posemo_change_h', 0.008995675186495

## Linear

In [156]:
import os
import settings
import pandas as pd
import statsmodels.api as sm
from sklearn.linear_model import LinearRegression


def read_data():
    """Load the segmented LIWC workbook from ``settings.PROCESSED_DIR``.

    Returns:
        pandas.DataFrame: ``all_with_liwc_segmented.xls`` as parsed by
        ``pd.read_excel`` with default options.
    """
    # No ``encoding`` argument: ``read_excel`` does not accept one in current
    # pandas (it raises TypeError), and it had no effect on .xls parsing.
    workbook_path = os.path.join(settings.PROCESSED_DIR, "all_with_liwc_segmented.xls")
    return pd.read_excel(workbook_path)



In [222]:
def create_summary(df, predictors=None, target="norm_persuasive"):
    """Fit an OLS regression with an intercept and print the statsmodels summary.

    Args:
        df: DataFrame holding the predictor and target columns.
        predictors: Column names to regress on. When ``None`` (the default),
            the eight-column set this cell was written for is used.
        target: Name of the dependent-variable column.

    Prints:
        ``summary()`` of the fitted ``statsmodels`` OLS results.
    """
    if predictors is None:
        predictors = ['i', 'negate', 'verb', 'interrog', 'risk', 'see', 'money', 'Moral']

    # add_constant appends the intercept column; OLS does not add one itself.
    design = sm.add_constant(df[predictors])
    fitted = sm.OLS(df[target], design).fit()
    print(fitted.summary())

In [223]:
df = read_data()
create_summary(df)

                            OLS Regression Results                            
Dep. Variable:        norm_persuasive   R-squared:                       0.145
Model:                            OLS   Adj. R-squared:                  0.142
Method:                 Least Squares   F-statistic:                     50.70
Date:                Thu, 11 Jan 2018   Prob (F-statistic):           3.81e-76
Time:                        17:59:51   Log-Likelihood:                -15545.
No. Observations:                2406   AIC:                         3.111e+04
Df Residuals:                    2397   BIC:                         3.116e+04
Df Model:                           8                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const        -36.9429     25.318     -1.459      0.1

## Look only at sentiment

In [234]:
def create_summary(df, predictors=None, target="norm_persuasive"):
    """Fit an OLS regression with an intercept and print the statsmodels summary.

    Args:
        df: DataFrame holding the predictor and target columns.
        predictors: Column names to regress on. When ``None`` (the default),
            only ``'negemo'`` is used.
        target: Name of the dependent-variable column.

    Prints:
        ``summary()`` of the fitted ``statsmodels`` OLS results.
    """
    if predictors is None:
        predictors = ['negemo']

    # add_constant appends the intercept column; OLS does not add one itself.
    design = sm.add_constant(df[predictors])
    fitted = sm.OLS(df[target], design).fit()
    print(fitted.summary())

In [235]:
create_summary(df)

                            OLS Regression Results                            
Dep. Variable:        norm_persuasive   R-squared:                       0.025
Model:                            OLS   Adj. R-squared:                  0.024
Method:                 Least Squares   F-statistic:                     61.01
Date:                Thu, 11 Jan 2018   Prob (F-statistic):           8.42e-15
Time:                        18:31:04   Log-Likelihood:                -15703.
No. Observations:                2406   AIC:                         3.141e+04
Df Residuals:                    2404   BIC:                         3.142e+04
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const         97.7353      6.073     16.094      0.0