In [90]:
import os
import settings
import pandas as pd
import operator

from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error

In [95]:
def read_data():
    """Load the processed, LIWC-segmented dataset into a DataFrame.

    Reads ``all_with_liwc_segmented.xls`` from ``settings.PROCESSED_DIR``
    using the default ``pd.read_excel`` options.

    Returns:
        pandas.DataFrame: the parsed workbook contents.
    """
    workbook_path = os.path.join(settings.PROCESSED_DIR, "all_with_liwc_segmented.xls")
    # No ``encoding`` argument: it is not a documented ``read_excel`` parameter.
    # Older pandas silently swallowed it via **kwds, while pandas >= 1.0 raises
    # TypeError on it. Excel workbooks carry their own encoding information.
    return pd.read_excel(workbook_path)

def create_test_set(df):
    """Split ``df`` into train and test partitions of predictors and target.

    Predictor columns are every column of ``df`` whose name is not listed in
    ``settings.NON_PREDICTORS``; the target is ``df[settings.TARGET]``.
    The split is delegated to ``train_test_split`` with ``random_state=42``.

    Args:
        df: DataFrame holding both the predictor columns and the target column.

    Returns:
        tuple: ``(X_train, X_test, y_train, y_test)``.
    """
    feature_columns = []
    for column in df.columns.tolist():
        if column not in settings.NON_PREDICTORS:
            feature_columns.append(column)

    features = df[feature_columns]
    target = df[settings.TARGET]

    train_features, test_features, train_target, test_target = train_test_split(
        features, target, random_state=42
    )
    return train_features, test_features, train_target, test_target


def compute_error(target, predictions):
    """Return the mean squared error between ``target`` and ``predictions``.

    Thin wrapper: both arguments are forwarded positionally, in this order,
    to ``mean_squared_error``.
    """
    mse = mean_squared_error(target, predictions)
    return mse

def sort_important_features(df, random_state=None):
    """Fit a decision tree on ``df`` and print its features ranked by importance.

    A ``DecisionTreeRegressor`` is fitted on every column of ``df`` not listed
    in ``settings.NON_PREDICTORS``, with ``df[settings.TARGET]`` as the target.
    Two things are printed:

    1. a list of ``(feature_name, importance)`` pairs, highest importance first;
    2. the tree's ``score`` on the same data it was fitted on.

    Args:
        df: DataFrame holding both the predictor columns and the target column.
        random_state: Optional seed forwarded to ``DecisionTreeRegressor``.
            The default ``None`` leaves the tree unseeded, so the ranking can
            differ from one call to the next; pass an int for a reproducible
            ranking.

    Returns:
        None. Results are only printed.
    """
    predictors = [p for p in df.columns.tolist() if p not in settings.NON_PREDICTORS]
    features = df[predictors]
    target = df[settings.TARGET]

    dt = DecisionTreeRegressor(random_state=random_state)
    dt.fit(features, target)

    results = dict(zip(predictors, dt.feature_importances_))
    sorted_results = sorted(results.items(), key=operator.itemgetter(1), reverse=True)
    print(sorted_results)

    # For a regressor, ``score`` is the coefficient of determination (R^2),
    # not a classification accuracy. It is computed here on the fitting data,
    # so it says nothing about performance on unseen data.
    r_squared = dt.score(features, target)
    print("R^2 (training data): {}".format(r_squared))

In [96]:
# Load the processed dataset once; the cells below reuse `df`.
df = read_data()

In [97]:
# Create the train/test partitions (the split is seeded inside create_test_set).
# NOTE(review): the importance cells shown below fit on the full `df`, not on
# X_train, so these partitions are unused in the cells visible here.
X_train, X_test, y_train, y_test = create_test_set(df)

In [98]:
# Run 1: fit a tree on the full frame and print the ranked feature importances.
sort_important_features(df)

[('risk', 0.073474394579168809), ('published_date', 0.07062221581144798), ('verb', 0.057852021874306638), ('posemo_change_q', 0.055980878170922212), ('filler', 0.046319516573587849), ('home', 0.040213357263335872), ('negate', 0.031849880540929271), ('negemo_4q', 0.02990427465167907), ('i', 0.029806043261614189), ('number', 0.029044097419533836), ('money', 0.028722939110058087), ('power', 0.026134150109493423), ('Authority', 0.025466443796991374), ('leisure', 0.024825651579089326), ('posemo_change_h', 0.020890899682812179), ('discrep', 0.019698670120846262), ('Moral', 0.018377141484917627), ('WC', 0.016495735395851804), ('ipron', 0.012843144097010682), ('social', 0.010523253868492842), ('achieve', 0.010285682703460147), ('quant', 0.010016548683207), ('drives', 0.0077525218317151638), ('interrog', 0.0077408316796077018), ('differ', 0.0076044250240186947), ('negemo_change_h', 0.0071831326961071147), ('percept', 0.0070179116308824886), ('article', 0.0069967984628870267), ('compare', 0.0069

In [99]:
# Run 2: identical call repeated; no seed is passed, so compare this ranking
# with the previous cell's output to see how much it varies between fits.
sort_important_features(df)

[('risk', 0.072793906593524541), ('published_date', 0.070839523627815856), ('verb', 0.057947385255419849), ('posemo_change_q', 0.056059451349545882), ('you', 0.047660097335756581), ('home', 0.04003415710137697), ('negate', 0.032119816539563459), ('i', 0.030029714845729069), ('negemo_4q', 0.029722482141877282), ('money', 0.029599816681203962), ('number', 0.028653119923373372), ('power', 0.025963800198250647), ('Authority', 0.025061142378878333), ('leisure', 0.025046680540016863), ('posemo_change_h', 0.021035324771943906), ('cause', 0.019531754735901012), ('Moral', 0.018625837570546432), ('adverb', 0.016093250342250509), ('WC', 0.015443459568411547), ('ipron', 0.01118861718746871), ('achieve', 0.010538884573238164), ('differ', 0.0094079578088586413), ('quant', 0.009066600245634231), ('drives', 0.0087964797393525421), ('discrep', 0.0079689240074802235), ('we', 0.0077759544830932371), ('interrog', 0.0077672552498139419), ('negemo_change_h', 0.0077131515862566887), ('work', 0.00760652361322

In [100]:
# Run 3: identical call repeated once more, again without a seed, for a third
# ranking to compare against the two previous outputs.
sort_important_features(df)

[('risk', 0.072361793382002548), ('published_date', 0.071927458675133116), ('Authority', 0.070939311234900312), ('verb', 0.058402112432591947), ('posemo_change_q', 0.056185580636087369), ('home', 0.040206142977920772), ('negate', 0.03470791272815725), ('money', 0.032940992212498674), ('negemo_4q', 0.029632262234803197), ('i', 0.029416334881278865), ('number', 0.029269940276709327), ('power', 0.026349845222030172), ('leisure', 0.024719287701861836), ('posemo_change_h', 0.021138568451401291), ('discrep', 0.019927278339401926), ('Moral', 0.018199246857742251), ('WC', 0.015673545399200003), ('cause', 0.015086613582684131), ('ipron', 0.011567452321371186), ('achieve', 0.01025755975149339), ('negemo_change_h', 0.0099692467607989717), ('quant', 0.0090851729544308771), ('drives', 0.0085574463057390675), ('WPS', 0.0081102140220728081), ('interrog', 0.0080669893501295713), ('differ', 0.0076688563425492103), ('article', 0.0076300847843808377), ('percept', 0.007389689348415625), ('anger_1q', 0.007

1
('risk', 0.073474394579168809),
Cautionary tale

('published_date', 0.07062221581144798),

('verb', 0.057852021874306638),
As in good writing, verbs matter

('posemo_change_q', 0.055980878170922212),
Change in positive emotion

('filler', 0.046319516573587849),
Um, hey, uh

('home', 0.040213357263335872), 
('negate', 0.031849880540929271),
('negemo_4q', 0.02990427465167907), 
('i', 0.029806043261614189), 
('number', 0.029044097419533836),
('money', 0.028722939110058087), 
('power', 0.026134150109493423),
('Authority', 0.025466443796991374),
('leisure', 0.024825651579089326),
('posemo_change_h', 0.020890899682812179),
('discrep', 0.019698670120846262), 
('Moral', 0.018377141484917627)

2
('risk', 0.072793906593524541), 
('published_date', 0.070839523627815856), 
('verb', 0.057947385255419849), 
('posemo_change_q', 0.056059451349545882),
('you', 0.047660097335756581), 
('home', 0.04003415710137697), 
('negate', 0.032119816539563459), 
('i', 0.030029714845729069), 
('negemo_4q', 0.029722482141877282),
('money', 0.029599816681203962),
('number', 0.028653119923373372), 
('power', 0.025963800198250647), 
('Authority', 0.025061142378878333),
('leisure', 0.025046680540016863),
('posemo_change_h', 0.021035324771943906), 
('cause', 0.019531754735901012), 
('Moral', 0.018625837570546432)

3
('risk', 0.072361793382002548),
('published_date', 0.071927458675133116),
('Authority', 0.070939311234900312), 
('verb', 0.058402112432591947), 
('posemo_change_q', 0.056185580636087369), 
('home', 0.040206142977920772), 
('negate', 0.03470791272815725),
('money', 0.032940992212498674),
('negemo_4q', 0.029632262234803197),
('i', 0.029416334881278865), 
('number', 0.029269940276709327), 
('power', 0.026349845222030172),
('leisure', 0.024719287701861836),
('posemo_change_h', 0.021138568451401291),
('discrep', 0.019927278339401926),
('Moral', 0.018199246857742251), 
('WC', 0.015673545399200003),
