In [1]:
import pandas as pd
import numpy as np

from sklearn.metrics import confusion_matrix, f1_score, precision_score,\
recall_score, confusion_matrix, classification_report, accuracy_score 
from sklearn.model_selection import GridSearchCV, cross_val_score, StratifiedKFold
from sklearn.feature_selection import SelectFromModel
from sklearn import svm
from sklearn.model_selection import GridSearchCV
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split
from sys import argv
import gc
from sklearn.pipeline import Pipeline
from imblearn.pipeline import make_pipeline, Pipeline
from imblearn.combine import SMOTEENN
from sklearn.preprocessing import StandardScaler, Normalizer
from sklearn.svm import LinearSVC
from sklearn.feature_selection import SelectFromModel
import matplotlib.pyplot as plt

from sklearn.ensemble import RandomForestClassifier

In [401]:
def ImportantFea(X, y):
	X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=35)
	
	model = ExtraTreesClassifier(random_state = 0)
	model.fit(X_train, y_train)
	#print(model.feature_importances_)

	print("Feature ranking:")

	importances = model.feature_importances_
	std = np.std([tree.feature_importances_ for tree in model.estimators_],
             axis=0)
	indices = np.argsort(importances)[::-1]
	for f in range(X.shape[1]):
		print("%d. feature %d (%f)" % (f + 1, indices[f], importances[indices[f]]))

	feat_importances = pd.Series(model.feature_importances_, index=X.columns)
	feat_importances.nlargest(20).plot(kind='barh')
	plt.show()

	fea_df = pd.DataFrame(feat_importances)
	fea_df['features'] = fea_df.index
	fea_df.columns = ['importance','features']
	fea_df.to_csv(path + 'topFea.csv')
	return fea_df

def ExTreeClassifier(X,y):
	X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=35)
	smote_enn = SMOTEENN(random_state=42)
	rf = make_pipeline(smote_enn, StandardScaler(), ExtraTreesClassifier(random_state = 0))
	cv = StratifiedKFold(n_splits=5, random_state = 0)

	parameters = [{'extratreesclassifier__max_features':['auto','sqrt','log2'], 'extratreesclassifier__class_weight':['balanced'], 
	             'extratreesclassifier__max_leaf_nodes':[10,50,100], 'extratreesclassifier__max_depth':[2,5,10,20], 'extratreesclassifier__n_estimators' : [50,100,200,300,400]}]
	             
	grid_search_item = GridSearchCV(rf,
	                            param_grid = parameters,
	                             scoring = 'accuracy',
	                             cv = cv,
	                             n_jobs = -1)

	grid_search = grid_search_item.fit(X_train, y_train)

	print('Best scores and best parameters')
	print(grid_search.best_score_)
	print(grid_search.best_params_)

	y_true, y_pred = y_test, grid_search.predict(X_test)
	print(classification_report(y_true, y_pred))

	importance = grid_search.best_estimator_.steps[2][1].feature_importances_

	feat_importances = pd.Series(importance, index=X.columns)
	feat_importances.nlargest(20).plot(kind='barh')
	plt.show()

	fea_df = pd.DataFrame(feat_importances)
	fea_df['features'] = fea_df.index
	fea_df.columns = ['importance','features']
	fea_df.to_csv(path + 'topFea.csv')
	return fea_df

def RFfeatureSel(X_train,y_train):
	
	smote_enn = SMOTEENN(random_state=42)
	rf = make_pipeline(smote_enn, StandardScaler(), RandomForestClassifier(random_state=20))
	cv = StratifiedKFold(n_splits=5, random_state = 0)

	parameters = [{'randomforestclassifier__max_features':['auto','sqrt','log2'], 'randomforestclassifier__class_weight':['balanced'], 
	             'randomforestclassifier__max_leaf_nodes':[10,50,100], 'randomforestclassifier__max_depth':[2,5,10,20], 'randomforestclassifier__n_estimators' : [50,100,200,300,400]}]
	             
	grid_search_item = GridSearchCV(rf,
	                            param_grid = parameters,
	                             scoring = 'accuracy',
	                             cv = cv,
	                             n_jobs = -1)

	grid_search = grid_search_item.fit(X_train, y_train)

	print('Best scores and best parameters')
	print(grid_search.best_score_)
	print(grid_search.best_params_)


	importance = grid_search.best_estimator_.steps[2][1].feature_importances_

	feat_importances = pd.Series(importance, index=X_train.columns)
	feat_importances.nlargest(20).plot(kind='barh')
	plt.show()

	fea_df = pd.DataFrame(feat_importances)
	fea_df['features'] = fea_df.index
	fea_df.columns = ['importance','features']
	fea_df.to_csv(path + 'topFea.csv')
	return fea_df


def selectFea(FeaImportant, Nfea):
    sortFea = FeaImportant.sort_values(by=['importance'],ascending=False)
    select = sortFea[1:Nfea]   
    return select.features.values


def SVMclassifier(X, y):
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=35)

    smote_enn = SMOTEENN(random_state=42)
    #ps = PredefinedSplit(test_fold=y_test)
    cv = StratifiedKFold(n_splits=5, random_state = 0)
    svc = make_pipeline(StandardScaler(), svm.SVC())
    parameters = [{'svc__kernel': ['linear', 'poly', 'rbf', 'sigmoid'], 'svc__gamma': [0.01, 0.001, 0.0001],
                         'svc__C':[0.1, 0.3, 0.5, 0.7, 0.9, 1.0, 1.5, 2.0, 10] , 'svc__class_weight':['balanced']}]

    grid_search_item = GridSearchCV(estimator = svc,
                              param_grid = parameters,
                               cv =  cv,
                               scoring = 'accuracy',
                               n_jobs = -1)
    grid_search = grid_search_item.fit(X_train, y_train)

    print('Best scores and best parameters')
    print(grid_search.best_score_)
    print(grid_search.best_params_)

    y_true, y_pred = y_test, grid_search.predict(X_test)
    print(classification_report(y_true, y_pred))
    print(confusion_matrix(y_true,y_pred))


def GetLIWC(file:str): 
	liwc = pd.read_csv(file)
	liwc = liwc.rename(columns = {liwc.columns[2]:'user_id'})
	liwcUser = liwc.groupby('user_id').mean().reset_index()
	liwcUser = liwcUser.drop(['Source (A)', 'Source (D)'], axis=1)
	return liwcUser


def mergeFea(features, liwc, empath): 
	features = pd.read_csv(features)

	#merge features
	liwcUser = GetLIWC(liwc)
	liwcUser2 = liwcUser.iloc[:,1::]
	liwcUser2.columns = [str(col) + '_liwc' for col in liwcUser2.columns]
	liwcUser2['user_id'] = liwcUser.user_id

	empath = pd.read_csv(empath)
	empath2 = empath.iloc[:,1::]
	empath2.columns = [str(col) + '_empath' for col in empath2.columns]
	empath2['user_id'] = empath.user_id

	allfea = pd.merge(features, liwcUser2, on = 'user_id', how = 'right')
	allfea = pd.merge(allfea, empath2, on = 'user_id', how = 'right')
	return allfea

def getCountVect(user_idFile, countVec):
	text = pd.read_csv(user_idFile)
	countVect = pd.read_csv(countVec)
	countVect['user_id'] = text['user_id']
	countVec2 = countVect.groupby(['user_id']).mean().reset_index()
	return countVec2

def featureUnionTest(selectFea, train_x, test_x):
    #select trainset features
    train_xSel = train_x.loc[:, train_x.columns.isin(selectFea)]
    test_x2 = test_x.loc[:, test_x.columns.isin(train_xSel)]
    test_x2['user_id'] = test_x.user_id
    #remove not in testset
    train_xSel = train_x.loc[:, train_x.columns.isin(test_x2.columns)]
    train_xSel['user_id'] = train_x.user_id
    #print('trainset and testset shape', train_x.shape, test_x.shape)
    allfea = test_x2.append(train_xSel)
    testSel = allfea[0:125]
    return testSel
    
    
def featureUnionTrain(selectFea, train_x, test_x):
    #select trainset features
    train_xSel = train_x.loc[:, train_x.columns.isin(selectFea)]
    test_x2 = test_x.loc[:, test_x.columns.isin(train_xSel)]
    test_x2['user_id'] = test_x.user_id
    #remove not in testset
    train_xSel = train_x.loc[:, train_x.columns.isin(test_x2.columns)]
    train_xSel['user_id'] = train_x.user_id
    #print('trainset and testset shape', train_x.shape, test_x.shape)
    allfea = test_x2.append(train_xSel)
    trainSel = allfea[125:621]
    return trainSel


def SVMPredictTestProb(X_train, y_train, X_test):
    
    smote_enn = SMOTEENN(random_state=42)
    cv = StratifiedKFold(n_splits=5, random_state = 0)
    svc = make_pipeline(StandardScaler(), svm.SVC(probability=True))
    parameters = [{'svc__kernel': ['linear', 'poly', 'rbf', 'sigmoid'], 'svc__gamma': [0.01, 0.001, 0.0001],
                         'svc__C':[0.1, 0.3, 0.5, 0.7, 0.9, 1.0, 1.5, 2.0, 10] , 'svc__class_weight':['balanced']}]


    grid_search_item = GridSearchCV(estimator = svc,
                              param_grid = parameters,
                               cv =  cv,
                               scoring = 'accuracy',
                               n_jobs = -1)
    grid_search = grid_search_item.fit(X_train, y_train)

    #y_pred = grid_search.predict(X_train)
    #print(classification_report(y_train, y_pred))

    print('Best scores and best parameters')
    print(grid_search.best_score_)
    print(grid_search.best_params_)

    y_pred = grid_search.predict_proba(X_test)
    X_test['classA'] = y_pred[:,0]
    X_test['classB'] = y_pred[:,1]
    X_test['classC'] = y_pred[:,2]
    X_test['classD'] = y_pred[:,3]
    ConfidenceS = X_test[['user_id', 'classA', 'classB', 'classC', 'classD']]

    return ConfidenceS

def SVMPredictTest(X_train, y_train, X_test):
    
    smote_enn = SMOTEENN(random_state=42)
    cv = StratifiedKFold(n_splits=5, random_state = 0)
    svc = make_pipeline(StandardScaler(), svm.SVC())
    parameters = [{'svc__kernel': ['linear', 'poly', 'rbf', 'sigmoid'], 'svc__gamma': [0.01, 0.001, 0.0001],
                         'svc__C':[0.1, 0.3, 0.5, 0.7, 0.9, 1.0, 1.5, 2.0, 10] , 'svc__class_weight':['balanced']}]


    grid_search_item = GridSearchCV(estimator = svc,
                              param_grid = parameters,
                               cv =  cv,
                               scoring = 'accuracy',
                               n_jobs = -1)
    grid_search = grid_search_item.fit(X_train, y_train)

    #y_pred = grid_search.predict(X_train)
    #print(classification_report(y_train, y_pred))

    print('Best scores and best parameters')
    print(grid_search.best_score_)
    print(grid_search.best_params_)

    y_pred = grid_search.predict(X_test)
    test = X_test.copy()
    test['predict_label_ML'] = y_pred
    idResults = test[['user_id','predict_label_ML']]

    return idResults


def getConfidenceScore(resultProbablity,resultLabel):
    resultFile = pd.merge(resultProbablity,resultLabel, on = 'user_id')
    conScore = {}
    for user, a, b, c, d, labels in zip(resultFile['user_id'], resultFile['classA'], resultFile['classB'], resultFile['classC'], resultFile['classD'], resultFile['predict_label_ML']):
        if labels is 'a':
            conScore[user] = a
        elif labels is 'b':
            conScore[user] = b
        elif labels is 'c':
            conScore[user] = c
        elif labels is 'd':
            conScore[user] = d
    conScoredf = pd.DataFrame.from_dict(conScore, orient='index', columns = ['confidenceScore'])   
    conScoredf['user_id'] = conScoredf.index
    return conScoredf

In [332]:
selectFeatures = selectFea(feaImp,500) 
test = featureUnionTest(selectFeatures, allfea3, allfeaTest)
train = featureUnionTrain(selectFeatures, allfea3, allfeaTest)
print(train.shape,test.shape)

(496, 132) (125, 132)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  sort=sort)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the doc

In [316]:
print(train.isnull().values.any())
print(test.isnull().values.any())

False
False


In [281]:
#path = '/Users/lucia/phd_work/Clpsy/'
path = '/home/lucia/phd_work/shareTask/'

#merge features
features = path + 'suicideDetection/features/FreqSentiMotiTopiFea.csv'
liwc = path + 'suicideDetection/features/liwcSW.csv'
empath = path + 'suicideDetection/features/empathSW.csv'
allfea = mergeFea(features, liwc, empath)

#select features and split train test
X = allfea.iloc[:, 3:146]
y = allfea.raw_label
#y = y.replace(['a', 'b', 'c', 'd'], [1, 2, 2, 2]) 

In [295]:
##add count vect

user_idFile = path + 'data/clpsych19_training_data/Btrain_NoNoise_SW.csv'
countVec = path + 'countVec2.csv'
countVec2 = getCountVect(user_idFile, countVec)
#add tfidf count vect as features
allfea = pd.merge(allfea, countVec2, on = 'user_id', how = 'right')

y = allfea.raw_label
# #y = y.replace(['a', 'b', 'c', 'd'], [1, 2, 2, 2]) 
allfea2 = allfea.iloc[:, 1::]
allfea3 = allfea2.drop(['raw_label'],axis = 1)
#allfea3 = allfea3.drop(['user_id'],axis = 1)
# #RF Select feature
#feaImp = RFfeatureSel(allfea3,y)
feaImp = pd.read_csv(path + 'topFea.csv')
#selectFea = selectFea(feaImp,200) 


In [283]:
print('get test set')
featuresT = path + 'suicideDetection/TestFeatures/FreqSentiMotiTopiFea.csv'
liwcT = path + 'suicideDetection/TestFeatures/liwcSW.csv'
empathT = path + 'suicideDetection/TestFeatures/empathSW.csv'
testFea = mergeFea(featuresT, liwcT, empathT)

user_idFile = path + 'data/clpsych19_training_data/testSW.csv'
countVec = path + 'suicideDetection/TestFeatures/countVec2.csv'
countVec2 = getCountVect(user_idFile, countVec)
#add tfidf count vect as features
allfeaTest = pd.merge(testFea, countVec2, on = 'user_id', how = 'right')


# selectedFeaTest = allfeaTest.loc[:, allfeaTest.columns.isin(selectFea)]
# selectedFeaTest['user_id'] = allfeaTest.user_id
#selectedFeaTest.to_csv(path + 'suicideDetection/TestFeatures/selectFeaTest.csv')

#feature union 
# train = featureUnionTrain(selectedFeaX,selectedFeaTest)    
# test = featureUnionTest(selectedFeaX,selectedFeaTest)
# result = SVMPredictTest(train, y, test)

get test set
(125, 329)
(496, 329)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  sort=sort)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the doc

In [185]:
#get confidence score
train = featureUnionTrain(selectedFeaX,selectedFeaTest)    
test = featureUnionTest(selectedFeaX,selectedFeaTest)
resultP = SVMPredictTestProb(train, y, test)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  sort=sort)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


Best scores and best parameters
0.4012096774193548
{'svc__C': 0.3, 'svc__class_weight': 'balanced', 'svc__gamma': 0.01, 'svc__kernel': 'sigmoid'}


  Xt = transform.transform(Xt)


In [199]:
# MLresults = pd.merge(resultP, result, on = 'user_id')
# MLresults.to_csv(path + '/data/clpsych19_training_data/testResultsCS.csv')

In [223]:
conScoredf=getConfidenceScore(resultP, result)
conScoredf2 = pd.merge(conScoredf, result, on = 'user_id')
conScoredf2.head(2)

Unnamed: 0,confidenceScore,user_id,predict_label_ML
0,0.575328,195.0,c
1,0.854208,450.0,a


In [133]:
# test['predicted_labels_ML'] = result
# idResults = test[['user_id','predicted_labels_ML']]


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [224]:
lmRe = pd.read_csv(path + '/data/clpsych19_training_data/testResults.csv')
lmRe = pd.merge(lmRe, conScoredf2, on = 'user_id')
lmRe.to_csv(path + '/data/clpsych19_training_data/testResultsBoth.csv')

In [399]:
#top 300 features
selectFeatures = selectFea(feaImp,400)
test = featureUnionTest(selectFeatures, allfea3, allfeaTest)
train = featureUnionTrain(selectFeatures, allfea3,400, allfeaTest)
print(train.shape,test.shape)
resultN = SVMclassifier(train, y) #train without manual features

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  sort=sort)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the doc

(496, 125) (125, 125)
Best scores and best parameters
0.5446685878962536
{'svc__C': 2.0, 'svc__class_weight': 'balanced', 'svc__gamma': 0.01, 'svc__kernel': 'rbf'}
              precision    recall  f1-score   support

           a       0.65      0.74      0.69        35
           b       0.67      0.22      0.33        18
           c       0.25      0.28      0.26        32
           d       0.60      0.62      0.61        64

   micro avg       0.53      0.53      0.53       149
   macro avg       0.54      0.47      0.48       149
weighted avg       0.54      0.53      0.52       149

[[26  0  6  3]
 [ 6  4  3  5]
 [ 3  1  9 19]
 [ 5  1 18 40]]


  return self.partial_fit(X, y)
  return self.fit(X, y, **fit_params).transform(X)
  Xt = transform.transform(Xt)


In [402]:
selectFeatures = selectFea(feaImp,400) 
test = featureUnionTest(selectFeatures, allfea3, allfeaTest)
train = featureUnionTrain(selectFeatures, allfea3, allfeaTest)
print(train.shape,test.shape)
#resultN = SVMclassifier(train, y) #just testing with train set
result = SVMPredictTest(train, y, test)#train labels
resultP = SVMPredictTestProb(train, y, test) #train confidence scores
print(train.shape,test.shape)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  sort=sort)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the doc

(496, 125) (125, 125)


  return self.partial_fit(X, y)
  return self.fit(X, y, **fit_params).transform(X)
  Xt = transform.transform(Xt)


Best scores and best parameters
0.5201612903225806
{'svc__C': 0.9, 'svc__class_weight': 'balanced', 'svc__gamma': 0.01, 'svc__kernel': 'rbf'}


  return self.partial_fit(X, y)
  return self.fit(X, y, **fit_params).transform(X)


Best scores and best parameters
0.5201612903225806
{'svc__C': 0.9, 'svc__class_weight': 'balanced', 'svc__gamma': 0.01, 'svc__kernel': 'rbf'}
(496, 125) (125, 129)


  Xt = transform.transform(Xt)


In [403]:
#compare with LM results
#merge confidence score
conScoredf=getConfidenceScore(resultP, result)
conScoredf2 = pd.merge(conScoredf, result, on = 'user_id')
lmRe = pd.read_csv(path + '/data/clpsych19_training_data/testResults.csv')
lmRe = pd.merge(lmRe, conScoredf2, on = 'user_id')
lmRe.to_csv(path + '/data/clpsych19_training_data/testResultsBoth3.csv')

In [404]:
print(lmRe.predict_label_ML.value_counts())
NoDup = lmRe.drop_duplicates(subset='user_id', keep="last")
print(NoDup.predict_label_ML.value_counts())
NoDup = NoDup[['user_id','predict_label_ML','confidenceScore']]
NoDup.to_csv(path + '/suicideDetection/results/MachineLearningModel.csv',header=None,index=False)

d    83
a    54
c    37
b    12
Name: predict_label_ML, dtype: int64
a    47
d    40
c    29
b     9
Name: predict_label_ML, dtype: int64


We seem to find that the ML model compensate the LM model in class A, so this classifer is to convert class A with low confidence score to the ML model result

In [405]:
from collections import Counter
newLabel = {}
for user, conScore, LanMLab, MaLab in zip(lmRe['userid'], lmRe['confidenceScore'], lmRe['predicted_label'], lmRe['predict_label_ML']):
    if MaLab is 'a' and conScore < 0.40:      
        newLabel[user] =  LanMLab
    if LanMLab is 'b':
        newLabel[user] = LanMLab
    else:
        newLabel[user] = MaLab

Counter(newLabel.values())

Counter({'a': 41, 'b': 20, 'c': 25, 'd': 39})

In [406]:
HybridModel = pd.DataFrame.from_dict(newLabel, orient='index', columns = ['label']) 
HybridModel['userid'] = HybridModel.index
HybridModel = HybridModel[['userid','label']]
HybridModel.to_csv(path + '/suicideDetection/results/HybriedModel.csv',header=None,index=False) 