In [1]:
import pandas as pd
import joblib
!pip install eli5
import eli5
import matplotlib.pyplot as plt
from eli5.sklearn import PermutationImportance



In [2]:
import fake_news_spreader_feature_extraction as feature_extraction
from fake_news_spreader_feature_extraction import cleanPunc, clean_relics

# Read the feature table from disk.
df = pd.read_csv("scaled_final_complete_features_with_labels_and_ids", sep=",", encoding="utf8")

# Column names of every feature group, listed up front so each group is easy to audit.
tfidf_columns = [str(x) for x in range(1000)]
readability_columns = [
    'avg_word_count', 'emoji_count', 'slang_count', 'capitalized_count', 'full_capitalized_count',
    'retweets_count', 'user_mentions_count', 'hashtags_count', 'url_count',
]
sentiment_columns = [
    'anger', 'fear', 'joy', 'sadness', 'negation', 'vader_compound_score', 'textblob_polarity_score',
]
personality_columns = [
    'extraversion', 'avoidance', 'conscientiousness', 'openness', 'neuroticism',
    'agreeableness', 'anxiety',
]
liwc_columns = ['Analytic', 'Clout', 'Authentic', 'Tone']

# One sub-frame per feature group.
data_tfidf = df[tfidf_columns]
data_readability = df[readability_columns]
data_sentiment = df[sentiment_columns]
data_personality = df[personality_columns]
data_gender = df[['gender']]
data_liwc = df[liwc_columns]
data_ground_truth = df[['ground_truth']]

# Feature combinations to assemble. Only one is active; an alternative without TF-IDF
# but with LIWC would be:
#   [data_readability, data_sentiment, data_personality, data_gender, data_liwc, data_ground_truth]
features = [
    [data_tfidf, data_readability, data_sentiment, data_personality, data_gender, data_ground_truth],
]

for feature_combination in features:
    # NOTE: `features` is rebound here from the list being iterated to the concatenated
    # frame; the loop keeps running over the original list object.
    features = pd.concat(list(feature_combination), axis=1)

    # Split the assembled frame into the model inputs and the flat label vector.
    X = features.drop(columns=['ground_truth']).reset_index(drop=True)
    y = features[['ground_truth']].to_numpy().ravel()

    print(X)
    print(y)

       0         1         2         3         4    5         6    7  \
0    0.0  1.000000  0.000000  0.000000  0.000000  0.0  0.000000  0.0   
1    0.0  0.000000  0.000000  0.000000  0.148459  0.0  0.121402  0.0   
2    0.0  0.000000  0.000000  0.000000  0.070025  0.0  0.229050  0.0   
3    0.0  0.068420  0.179247  0.000000  0.000000  0.0  0.000000  0.0   
4    0.0  0.091617  0.000000  0.000000  0.000000  0.0  0.000000  0.0   
..   ...       ...       ...       ...       ...  ...       ...  ...   
295  0.0  0.000000  0.355588  0.097166  0.000000  0.0  0.000000  0.0   
296  0.0  0.000000  0.000000  0.000000  0.000000  0.0  0.000000  0.0   
297  0.0  0.000000  0.000000  0.000000  0.000000  0.0  0.000000  0.0   
298  0.0  0.000000  0.000000  0.000000  0.000000  0.0  0.000000  0.0   
299  0.0  0.000000  0.000000  0.000000  0.000000  0.0  0.000000  0.0   

            8         9  ...  vader_compound_score  textblob_polarity_score  \
0    0.000000  0.000000  ...              0.999850      

In [None]:
# Path of the persisted classifier that the rest of the notebook explains.
filename = "classifiers/fake_news/" \
           "Gradient Boosting_phase_C_tfidf_readability_sentiment_personality_gender_0.7299999999999999.sav"
# NOTE(security): joblib.load unpickles the file, which can execute arbitrary code.
# Only load model files that come from a trusted source.
model = joblib.load(filename)

# Predictions for every row; reused by the per-sample explanation cells.
y_predicted = model.predict(X.values)

# The scorer receives the same plain array that predict() receives, so the model sees
# identical input in both places. A fixed random_state makes the column shuffles, and
# therefore the importance ranking, reproducible across runs.
perm = PermutationImportance(model, random_state=42).fit(X.values, y)
eli5.show_weights(perm, feature_names=X.columns.tolist(), top=20)

In [None]:
# Row of X to explain; change it to inspect another instance.
sample = 1

# Show the stored label next to the model's prediction for that row.
print('Reference:', y[sample])
print('Predicted:', y_predicted[sample])

# Per-feature contributions to this single prediction.
eli5.show_prediction(
    model,
    X.iloc[sample],
    feature_names=list(X.columns),
    show_feature_values=True,
)

Reference: 1.0
Predicted: 1.0


Contribution?,Feature,Value
1.1,744,0.346
0.71,820,0.316
0.693,128,0.623
0.671,45,0.44
0.667,95,0.466
0.663,22,0.31
0.638,43,0.213
0.609,903,0.462
0.566,vader_compound_score,0.004
0.565,273,0.351


In [None]:
# Same local explanation as the previous cell, for a different row of X.
sample = 2

# Label and prediction for the selected row.
print('Reference:', y[sample])
print('Predicted:', y_predicted[sample])

# Break the prediction down into per-feature contributions.
eli5.show_prediction(
    model,
    X.iloc[sample],
    feature_names=list(X.columns),
    show_feature_values=True,
)

Reference: 0.0
Predicted: 0.0


Contribution?,Feature,Value
1.779,479,0.255
1.257,312,0.54
1.235,777,0.464
1.121,406,0.65
1.05,858,0.53
0.832,985,0.297
0.798,user_mentions_count,0.114
0.717,487,0.249
0.674,emoji_count,0.082
0.639,730,0.146


In [None]:
!pip install pdpbox
from pdpbox import pdp, get_dataset, info_plots

def plot_pdp(model, df, feature, cluster_flag=False, nb_clusters=None, lines_flag=False,
             output_dir='explanations/pdp', close_figure=True):
    """Draw the partial dependence plot of one feature and save it as a PNG.

    Parameters
    ----------
    model
        Fitted estimator, passed to ``pdp.pdp_isolate``.
    df : pandas.DataFrame
        Data the partial dependence is computed on. All of its columns are
        passed as ``model_features``.
    feature : str
        Column of ``df`` to plot; also used in the output file name.
    cluster_flag, nb_clusters, lines_flag
        Forwarded to ``pdp.pdp_plot`` as ``cluster``, ``n_cluster_centers``
        and ``plot_lines``.
    output_dir : str
        Directory the image is written to as ``<feature>_pdp.png``. It is
        created when missing.
    close_figure : bool
        When true (default) the current figure is closed after saving, so a
        loop over many features does not accumulate open figures. Pass
        ``False`` to keep the figure open, e.g. for inline display.
    """
    import os  # local import keeps the block self-contained

    # savefig does not create missing directories, so make sure the target exists.
    os.makedirs(output_dir, exist_ok=True)

    # Create the data that we will plot
    pdp_goals = pdp.pdp_isolate(model=model, dataset=df, model_features=df.columns.tolist(), feature=feature)

    # plot it
    pdp.pdp_plot(pdp_goals, feature, cluster=cluster_flag, n_cluster_centers=nb_clusters, plot_lines=lines_flag)
    plt.savefig(os.path.join(output_dir, '{0}_pdp.png'.format(feature)))

    if close_figure:
        # Release the figure that was just saved.
        plt.close()

# Produce one univariate partial dependence plot per column of X.
# To plot a single feature instead, call for example
#   plot_pdp(model, X, 'capitalized_count')
#   plot_pdp(model, X, 'hashtags_count')
for feature in list(X):
    plot_pdp(model=model, df=X, feature=feature)



In [None]:
!pip install lime
import lime.lime_tabular

# Change the number below to test other instances
sample = 1
y_true = y[sample]
y_pp = y_predicted[sample]

# random_state pins LIME's random sampling so a fresh run of the notebook yields
# the same explanation.
explainer = lime.lime_tabular.LimeTabularExplainer(
    X.values,
    feature_names=X.columns.values.tolist(),
    class_names=[0, 1],
    random_state=42,
)


def predict_fn(x):
    """Return the model's class probabilities for ``x`` as a float array."""
    return model.predict_proba(x).astype(float)


# Explain the selected row using its 10 most influential features.
explanation = explainer.explain_instance(X.values[sample], predict_fn, num_features=10)

print('Reference:', y_true)
print('Predicted:', y_pp)
explanation.show_in_notebook()

In [None]:
!pip install shap
import shap

# load JS visualization code to notebook
shap.initjs()

shap_explainer = shap.TreeExplainer(model)
shap_values = shap_explainer.shap_values(X)

# The plot below indexes shap_values[1], which only works when shap_values is a list
# with one array per class. When a single array comes back instead, convert it to
# that layout so index 1 keeps selecting a (samples, features) matrix.
# NOTE(review): confirm which layout this model/shap version produces.
if not isinstance(shap_values, list):
    if getattr(shap_values, "ndim", 2) == 3:
        # assumed (samples, features, classes): split along the last axis
        shap_values = [shap_values[:, :, k] for k in range(shap_values.shape[2])]
    else:
        # single-output array: treated here as the class-1 attributions, with the
        # class-0 attributions taken as their negation
        shap_values = [-shap_values, shap_values]

print('Expected Value: ', shap_explainer.expected_value)
shap.summary_plot(shap_values[1], X, plot_type="bar")

In [None]:
# load JS visualization code to notebook

shap.initjs()

explainer = shap.TreeExplainer(model)
shap_values = explainer.shap_values(X)

# Later code indexes shap_values[1], which requires a list with one array per class.
# When a single array is returned instead, convert it to that layout.
# NOTE(review): confirm which layout this model/shap version produces.
if not isinstance(shap_values, list):
    if getattr(shap_values, "ndim", 2) == 3:
        # assumed (samples, features, classes): split along the last axis
        shap_values = [shap_values[:, :, k] for k in range(shap_values.shape[2])]
    else:
        # single-output array: treated here as the class-1 attributions, with the
        # class-0 attributions taken as their negation
        shap_values = [-shap_values, shap_values]

# change the value below to change instance
sample = 1

# expected_value may be a plain scalar, a length-1 array, or hold one entry per
# class; use entry 1 when it exists and the only entry otherwise.
base_values = explainer.expected_value
try:
    base_values = list(base_values)
except TypeError:  # scalar or 0-d array
    base_values = [base_values]
base_value = base_values[1] if len(base_values) > 1 else base_values[0]

shap.force_plot(base_value, shap_values[1][sample, :], X.iloc[sample, :], matplotlib=True)



In [None]:
# Summary plot of the SHAP values stored at index 1 of shap_values
# (presumably the positive class - confirm), over all rows of X.
selected_shap_values = shap_values[1]
shap.summary_plot(selected_shap_values, X)


In [None]:
shap.initjs()

# Column of X whose SHAP dependence is plotted; swap in any other column name.
feature = "hashtags_count"
shap.dependence_plot(feature, shap_values[1], X)