In [1]:
from sklearn.model_selection import train_test_split
import pandas as pd
import numpy as np


from sklearn.model_selection import cross_val_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC,LinearSVC
from sklearn.naive_bayes import GaussianNB

from sklearn import metrics

from sklearn import preprocessing


In [2]:
def read_preprocess(path):
    df=pd.read_csv(path,index_col=0,dtype='str')
    df=df.dropna().copy()
    df.index=pd.RangeIndex(start=0, stop=df.shape[0], step=1)
    indices_1=df[df["label"]=="1"].index
    indices_0=df[df["label"]=="0"].index
    df["label"]=df["label"].apply(int)

    df['author_rate']=(df['author_followers'].apply(float)+0.0001)/(df['author_following'].apply(float)+0.0001)
    df['mentioned']=df['num_NGO_mentions'].apply(lambda x: int(int(x)>0))
    # df['question']=df['num_question'].apply(lambda x: int(int(x)>0))
    df['sentiment']=df['sentiment'].apply(lambda x: -1 if x=='negative' else (0 if x=='neutral' else 1))
    
    return df

In [3]:
# res=pd.DataFrame(columns=["name","features","DT","RF","LR","SVM","NB"])
res=pd.read_csv("results/res.csv")

In [4]:
def run_all_models(X_scaled,y,X_scaled_svm,y_svm):
    clf = DecisionTreeClassifier(random_state=0,class_weight="balanced")
    res_dt=cross_val_score(clf, X_scaled, y, cv=10,scoring='balanced_accuracy')
    print("Decision Tree:\n",res_dt,'\n',res_dt.mean())
    
    clf = RandomForestClassifier(random_state=0,class_weight="balanced")
    res_rf=cross_val_score(clf, X_scaled, y, cv=10,scoring='balanced_accuracy')
    print("RandomForest:\n",res_rf,'\n',res_rf.mean())
    
    clf = LogisticRegression(random_state=0,class_weight="balanced")
    res_lr=cross_val_score(clf, X_scaled, y, cv=10,scoring='balanced_accuracy')
    print("Logistic Regression:\n",res_lr,'\n',res_lr.mean())
    
    clf = SVC(random_state=0,class_weight="balanced")
    res_svm=cross_val_score(clf, X_scaled_svm, y_svm, cv=5,scoring='balanced_accuracy',n_jobs=-1)
    print("SVM:\n",res_svm,'\n',res_svm.mean())
    
    clf=GaussianNB()
    res_nb=cross_val_score(clf, X_scaled, y, cv=10,scoring='balanced_accuracy')
    print("Naive Bayesian:\n",res_nb,'\n',res_nb.mean())
    
    return res_dt.mean(),res_rf.mean(),res_lr.mean(),res_svm.mean(),res_nb.mean()

In [5]:
def get_x_y(df,features,large=True):
    X=df[features]
    scaler = preprocessing.StandardScaler().fit(X)
    scaler

    X_scaled = scaler.transform(X)
    y=df['label']

    df_svm=df.sample(frac=0.4).copy()
    X_svm=df_svm[features]

    scaler = preprocessing.StandardScaler().fit(X)
    scaler

    X_scaled_svm = scaler.transform(X_svm)

    y_svm=df_svm['label']
    
    if large:
        return X_scaled,y,X_scaled_svm,y_svm
    else:
        return X_scaled,y,X_scaled,y

In [6]:
def add_res(res,name,features,dt,rf,lr,svm,nb):
    t=pd.DataFrame([name,features,dt,rf,lr,svm,nb]).transpose()
    t.columns=["name","features","DT","RF","LR","SVM","NB"]
    res=pd.concat([res,t])
    return res

In [7]:
features=['author_rate','num_full_words','num_tokenized_words','mentioned','num_hashtags','attachment','num_NGO_mentions','num_mentions','num_exclamation', 'has_question','sentiment','retweet','topic','num_characters']

### NGOs

In [8]:
name="ngos"
path='datasets/extended_features.csv'
df=read_preprocess(path)
X,y,X_svm,y_svm=get_x_y(df,features,large=True)

In [9]:
dt,rf,lr,svm,nb=run_all_models(X,y,X_svm,y_svm)

Decision Tree:
 [0.5283885  0.54468775 0.54487394 0.52345509 0.53862533 0.53173144
 0.52151593 0.51723246 0.52944901 0.51536283] 
 0.5295322292862142
RandomForest:
 [0.50949537 0.51415873 0.51390241 0.50681239 0.51718424 0.50787127
 0.50667357 0.50982134 0.50402519 0.5099    ] 
 0.5099844516323297
Logistic Regression:
 [0.68681215 0.73871603 0.68919575 0.64472495 0.65242756 0.62504626
 0.6346866  0.54705454 0.62337462 0.58223054] 
 0.6424269007971002
SVM:
 [0.68017502 0.70157153 0.70049634 0.67371319 0.67365694] 
 0.6859226024205711
Naive Bayesian:
 [0.54580261 0.55623465 0.5185099  0.53070721 0.51983883 0.51075848
 0.53320401 0.52840541 0.51726444 0.53828532] 
 0.5299010843071603


In [10]:
res=add_res(res,name,features,dt,rf,lr,svm,nb)

### Banks

In [11]:
name="banks"
path='datasets/Banks_extended_features.csv'
df=read_preprocess(path)
print(df.shape)

(21659, 28)


In [12]:
X,y,X_svm,y_svm=get_x_y(df,features,large=False)

KeyError: "['has_question', 'retweet', 'topic', 'num_characters'] not in index"

In [None]:
dt,rf,lr,svm,nb=run_all_models(X,y,X_svm,y_svm)

Decision Tree:
 [0.5911189  0.57064709 0.58232545 0.60430909 0.57928025 0.59189586
 0.54796305 0.63845408 0.64044799 0.55796493] 
 0.5904406686818795
RandomForest:
 [0.56645486 0.5555872  0.58562664 0.58225241 0.5665495  0.57384638
 0.55981624 0.65927443 0.64858487 0.59079893] 
 0.5888791467406074
Logistic Regression:
 [0.60725241 0.59505551 0.66401548 0.72060327 0.62790633 0.66172895
 0.64795457 0.69357341 0.70590175 0.60817805] 
 0.6532169731612386
SVM:
 [0.60881726 0.69064718 0.63789021 0.67351753 0.65696764] 
 0.6535679660299089
Naive Bayesian:
 [0.57392638 0.5634385  0.61069968 0.63706544 0.59987637 0.61774282
 0.59941698 0.59314311 0.62817663 0.58687736] 
 0.6010363261277301


In [None]:
res=add_res(res,name,features,dt,rf,lr,svm,nb)

### Airlines

In [None]:
name="airlines"
path='datasets/Airlines_extended_features.csv'
print(df.shape)
df=read_preprocess(path)

(21659, 29)


In [None]:
X,y,X_svm,y_svm=get_x_y(df,features,large=False)

In [None]:
dt,rf,lr,svm,nb=run_all_models(X,y,X_svm,y_svm)

Decision Tree:
 [0.56266831 0.56872059 0.54406571 0.57191244 0.57794528 0.57231014
 0.56456225 0.54863199 0.58803602 0.57135109] 
 0.5670203831010882
RandomForest:
 [0.61218919 0.5991418  0.58516498 0.578504   0.60077805 0.56876206
 0.5843196  0.57905718 0.61848971 0.62858299] 
 0.5954989562064834
Logistic Regression:
 [0.59925991 0.62500938 0.58476103 0.52085329 0.59498744 0.64919696
 0.5551368  0.59230347 0.65234629 0.68657696] 
 0.606043154227161
SVM:
 [0.64003673 0.59546511 0.6551309  0.61535816 0.67260289] 
 0.6357187591533684
Naive Bayesian:
 [0.54816813 0.56932506 0.54101181 0.52455961 0.57872472 0.5400422
 0.52570274 0.53223254 0.53955405 0.53066089] 
 0.5429981751437661


In [None]:
res=add_res(res,name,features,dt,rf,lr,svm,nb)

### ICRC

In [13]:
name="icrc"
path='datasets/ICRC_extended_features.csv'
df=read_preprocess(path)
print(df.shape)

(67420, 29)


In [14]:
X,y,X_svm,y_svm=get_x_y(df,features,large=False)

KeyError: "['retweet', 'topic', 'num_characters'] not in index"

In [None]:
dt,rf,lr,svm,nb=run_all_models(X,y,X_svm,y_svm)

Decision Tree:
 [0.58763965 0.5712159  0.5542945  0.55288096 0.5619915  0.54767014
 0.55271906 0.54904524 0.55729979 0.57411709] 
 0.5608873831674795
RandomForest:
 [0.58245666 0.57275185 0.56331641 0.54808787 0.57353816 0.54421357
 0.55424165 0.55633515 0.54954146 0.57229431] 
 0.5616777089782194
Logistic Regression:
 [0.73921577 0.78067686 0.72209115 0.65046835 0.66931671 0.63179505
 0.63524715 0.6541867  0.59148127 0.74500116] 
 0.6819480161493902
SVM:
 [0.74682601 0.71373701 0.67662393 0.65049841 0.71330208] 
 0.7001974877417465
Naive Bayesian:
 [0.64282075 0.70194947 0.65526286 0.63641373 0.66052965 0.54556698
 0.53690574 0.55088262 0.5412485  0.57498182] 
 0.6046562122538796


In [None]:
res=add_res(res,name,features,dt,rf,lr,svm,nb)

### Other NGOs

In [None]:
name="other_ngos"
path='datasets/OtherNGO_extended_features.csv'
df=read_preprocess(path)
print(df.shape)

(48058, 30)


In [None]:
X,y,X_svm,y_svm=get_x_y(df,features,large=False)

In [None]:
dt,rf,lr,svm,nb=run_all_models(X,y,X_svm,y_svm)

Decision Tree:
 [0.49936143 0.50947212 0.51426139 0.50757852 0.51715911 0.5120397
 0.49594608 0.57048223 0.5658675  0.49561974] 
 0.5187787809528526
RandomForest:
 [0.48919753 0.4902086  0.49856322 0.50302946 0.50485865 0.50186826
 0.49824893 0.54292023 0.51984423 0.49088957] 
 0.5039628687465282
Logistic Regression:
 [0.4691358  0.63197105 0.63633461 0.56557865 0.55455659 0.59193161
 0.55982151 0.54326302 0.6208011  0.54283073] 
 0.5716224679004711
SVM:
 [0.48806282 0.6303657  0.60370528 0.62316716 0.58632483] 
 0.586325156754284
Naive Bayesian:
 [0.56758195 0.50771605 0.50723712 0.56165755 0.53738576 0.55576368
 0.54491155 0.50951515 0.54766242 0.51653636] 
 0.5355967590748814


In [None]:
res=add_res(res,name,features,dt,rf,lr,svm,nb)

### Save results

In [15]:
res

Unnamed: 0,name,features,DT,RF,LR,SVM,NB
0,ngos,"['author_rate', 'num_full_words', 'num_tokeniz...",0.549477,0.546262,0.636432,0.671548,0.567851
1,banks,"['author_rate', 'num_full_words', 'num_tokeniz...",0.590441,0.588879,0.653217,0.653568,0.601036
2,airlines,"['author_rate', 'num_full_words', 'num_tokeniz...",0.56702,0.595499,0.606043,0.635719,0.542998
3,icrc,"['author_rate', 'num_full_words', 'num_tokeniz...",0.560887,0.561678,0.681948,0.700197,0.604656
4,other_ngos,"['author_rate', 'num_full_words', 'num_tokeniz...",0.518779,0.503963,0.571622,0.586325,0.535597
0,ngos,"[author_rate, num_full_words, num_tokenized_wo...",0.529532,0.509984,0.642427,0.685923,0.529901


In [16]:
res.to_csv('results/res.csv',index=False)