In [2]:
import pandas as pd
import numpy as np

from sklearn.datasets import make_classification
from sklearn.metrics import confusion_matrix, f1_score, precision_score,\
recall_score, confusion_matrix, classification_report, accuracy_score 
from sklearn.model_selection import GridSearchCV, cross_val_score, StratifiedKFold
from sklearn.linear_model import SGDClassifier
from sklearn.feature_selection import SelectFromModel
from sklearn.svm import LinearSVC
from sklearn import svm
from sklearn.model_selection import GridSearchCV
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split
from sys import argv
import gc
from sklearn.decomposition import TruncatedSVD
from sklearn.pipeline import Pipeline
from imblearn.pipeline import make_pipeline, Pipeline
from imblearn.combine import SMOTEENN
from sklearn.preprocessing import StandardScaler, Normalizer

from sklearn.svm import LinearSVC
from sklearn.feature_selection import SelectFromModel
from sklearn.ensemble import ExtraTreesClassifier
import matplotlib.pyplot as plt

from sklearn.ensemble import RandomForestClassifier

In [10]:
def ImportantFea(X, y):
	X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=35)
	
	model = ExtraTreesClassifier(random_state = 0)
	model.fit(X_train, y_train)
	#print(model.feature_importances_)

	print("Feature ranking:")

	importances = model.feature_importances_
	std = np.std([tree.feature_importances_ for tree in model.estimators_],
             axis=0)
	indices = np.argsort(importances)[::-1]
	for f in range(X.shape[1]):
		print("%d. feature %d (%f)" % (f + 1, indices[f], importances[indices[f]]))

	feat_importances = pd.Series(model.feature_importances_, index=X.columns)
	feat_importances.nlargest(20).plot(kind='barh')
	plt.show()

	fea_df = pd.DataFrame(feat_importances)
	fea_df['features'] = fea_df.index
	fea_df.columns = ['importance','features']
	fea_df.to_csv(path + 'topFea.csv')
	return fea_df

def ExTreeClassifier(X,y):
	X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=35)
	smote_enn = SMOTEENN(random_state=42)
	rf = make_pipeline(smote_enn, StandardScaler(), ExtraTreesClassifier(random_state = 0))
	cv = StratifiedKFold(n_splits=5, random_state = 0)

	parameters = [{'extratreesclassifier__max_features':['auto','sqrt','log2'], 'extratreesclassifier__class_weight':['balanced'], 
	             'extratreesclassifier__max_leaf_nodes':[10,50,100], 'extratreesclassifier__max_depth':[2,5,10,20], 'extratreesclassifier__n_estimators' : [50,100,200,300,400]}]
	             
	grid_search_item = GridSearchCV(rf,
	                            param_grid = parameters,
	                             scoring = 'accuracy',
	                             cv = cv,
	                             n_jobs = -1)

	grid_search = grid_search_item.fit(X_train, y_train)

	print('Best scores and best parameters')
	print(grid_search.best_score_)
	print(grid_search.best_params_)

	y_true, y_pred = y_test, grid_search.predict(X_test)
	print(classification_report(y_true, y_pred))

	importance = grid_search.best_estimator_.steps[2][1].feature_importances_

	feat_importances = pd.Series(importance, index=X.columns)
	feat_importances.nlargest(20).plot(kind='barh')
	plt.show()

	fea_df = pd.DataFrame(feat_importances)
	fea_df['features'] = fea_df.index
	fea_df.columns = ['importance','features']
	fea_df.to_csv(path + 'topFea.csv')
	return fea_df

#RF to show important features
def RFClassifier(X,y):
	X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=35)
	smote_enn = SMOTEENN(random_state=42)
	rf = make_pipeline(smote_enn, StandardScaler(), RandomForestClassifier(random_state=20))
	cv = StratifiedKFold(n_splits=5, random_state = 0)

	parameters = [{'randomforestclassifier__max_features':['auto','sqrt','log2'], 'randomforestclassifier__class_weight':['balanced'], 
	             'randomforestclassifier__max_leaf_nodes':[10,50,100], 'randomforestclassifier__max_depth':[2,5,10,20], 'randomforestclassifier__n_estimators' : [50,100,200,300,400]}]
	             
	grid_search_item = GridSearchCV(rf,
	                            param_grid = parameters,
	                             scoring = 'accuracy',
	                             cv = cv,
	                             n_jobs = -1)

	grid_search = grid_search_item.fit(X_train, y_train)

	print('Best scores and best parameters')
	print(grid_search.best_score_)
	print(grid_search.best_params_)

	y_true, y_pred = y_test, grid_search.predict(X_test)
	print(classification_report(y_true, y_pred))

	importance = grid_search.best_estimator_.steps[2][1].feature_importances_

	feat_importances = pd.Series(importance, index=X.columns)
	feat_importances.nlargest(20).plot(kind='barh')
	plt.show()

	fea_df = pd.DataFrame(feat_importances)
	fea_df['features'] = fea_df.index
	fea_df.columns = ['importance','features']
	fea_df.to_csv(path + 'topFea.csv')
	return fea_df


def RFfeatureSel(X_train,y_train):
	
	smote_enn = SMOTEENN(random_state=42)
	rf = make_pipeline(smote_enn, StandardScaler(), RandomForestClassifier(random_state=20))
	cv = StratifiedKFold(n_splits=5, random_state = 0)

	parameters = [{'randomforestclassifier__max_features':['auto','sqrt','log2'], 'randomforestclassifier__class_weight':['balanced'], 
	             'randomforestclassifier__max_leaf_nodes':[10,50,100], 'randomforestclassifier__max_depth':[2,5,10,20], 'randomforestclassifier__n_estimators' : [50,100,200,300,400]}]
	             
	grid_search_item = GridSearchCV(rf,
	                            param_grid = parameters,
	                             scoring = 'accuracy',
	                             cv = cv,
	                             n_jobs = -1)

	grid_search = grid_search_item.fit(X_train, y_train)

	print('Best scores and best parameters')
	print(grid_search.best_score_)
	print(grid_search.best_params_)


	importance = grid_search.best_estimator_.steps[2][1].feature_importances_

	feat_importances = pd.Series(importance, index=X_train.columns)
	feat_importances.nlargest(20).plot(kind='barh')
	plt.show()

	fea_df = pd.DataFrame(feat_importances)
	fea_df['features'] = fea_df.index
	fea_df.columns = ['importance','features']
	fea_df.to_csv(path + 'topFea.csv')
	return fea_df


def selectFea(FeaImportant, Nfea):
    sortFea = FeaImportant.sort_values(by=['importance'],ascending=False)
    select = sortFea[1:Nfea]   
    return select.features.values


def SVMclassifier(X, y, X_test2):

	X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=35)

	smote_enn = SMOTEENN(random_state=42)
	#ps = PredefinedSplit(test_fold=y_test)
	cv = StratifiedKFold(n_splits=5, random_state = 0)
	svc = make_pipeline(StandardScaler(), svm.SVC())
	parameters = [{'svc__kernel': ['linear', 'poly', 'rbf', 'sigmoid'], 'svc__gamma': [0.01, 0.001, 0.0001],
	                     'svc__C':[0.1, 0.3, 0.5, 0.7, 0.9, 1.0, 1.5, 2.0, 10] , 'svc__class_weight':['balanced']}]
	                   
	grid_search_item = GridSearchCV(estimator = svc,
	                          param_grid = parameters,
	                           cv =  cv,
	                           scoring = 'accuracy',
	                           n_jobs = -1)
	grid_search = grid_search_item.fit(X_train, y_train)

	print('Best scores and best parameters')
	print(grid_search.best_score_)
	print(grid_search.best_params_)

	y_true, y_pred = y_test, grid_search.predict(X_test)
	print(classification_report(y_true, y_pred))

	y_pred2 = grid_search.predict(X_test2)
	return y_pred2



def SVMPredictTest(X_train, y_train, X_test):

	smote_enn = SMOTEENN(random_state=42)
	cv = StratifiedKFold(n_splits=5, random_state = 0)
	svc = make_pipeline(smote_enn, StandardScaler(), svm.SVC())
	parameters = [{'svc__kernel': ['linear', 'poly', 'rbf', 'sigmoid'], 'svc__gamma': [0.01, 0.001, 0.0001],
	                     'svc__C':[0.1, 0.3, 0.5, 0.7, 0.9, 1.0, 1.5, 2.0, 10] , 'svc__class_weight':['balanced']}]
	                    

	grid_search_item = GridSearchCV(estimator = svc,
	                          param_grid = parameters,
	                           cv =  cv,
	                           scoring = 'accuracy',
	                           n_jobs = -1)
	grid_search = grid_search_item.fit(X_train, y_train)

	#y_pred = grid_search.predict(X_train)
	#print(classification_report(y_train, y_pred))

	print('Best scores and best parameters')
	print(grid_search.best_score_)
	print(grid_search.best_params_)

	y_pred = grid_search.predict(X_test)

	return y_pred
	


def GetLIWC(file:str): 
	liwc = pd.read_csv(file)
	liwc = liwc.rename(columns = {liwc.columns[2]:'user_id'})
	liwcUser = liwc.groupby('user_id').mean().reset_index()
	liwcUser = liwcUser.drop(['Source (A)', 'Source (D)'], axis=1)
	return liwcUser


def mergeFea(features, liwc, empath): 
	features = pd.read_csv(features)

	#merge features
	liwcUser = GetLIWC(liwc)
	liwcUser2 = liwcUser.iloc[:,1::]
	liwcUser2.columns = [str(col) + '_liwc' for col in liwcUser2.columns]
	liwcUser2['user_id'] = liwcUser.user_id

	empath = pd.read_csv(empath)
	empath2 = empath.iloc[:,1::]
	empath2.columns = [str(col) + '_empath' for col in empath2.columns]
	empath2['user_id'] = empath.user_id

	allfea = pd.merge(features, liwcUser2, on = 'user_id', how = 'right')
	allfea = pd.merge(allfea, empath2, on = 'user_id', how = 'right')
	return allfea

def getCountVect(user_idFile, countVec):
	text = pd.read_csv(user_idFile)
	countVect = pd.read_csv(countVec)
	countVect['user_id'] = text['user_id']
	countVec2 = countVect.groupby(['user_id']).mean().reset_index()
	return countVec2

In [4]:
path = '/Users/lucia/phd_work/Clpsy/'
#path = '/home/lucia/phd_work/shareTask/'

#merge features
features = path + 'suicideDetection/features/FreqSentiMotiTopiFea.csv'
liwc = path + 'suicideDetection/features/liwcSW.csv'
empath = path + 'suicideDetection/features/empathSW.csv'
allfea = mergeFea(features, liwc, empath)

#select features and split train test
X = allfea.iloc[:, 3:146]
y = allfea.raw_label


In [13]:
##add count vect

user_idFile = path + 'data/clpsych19_training_data/Btrain_NoNoise_SW.csv'
countVec = path + 'countVec2.csv'
countVec2 = getCountVect(user_idFile, countVec)
#add tfidf count vect as features
allfea = pd.merge(allfea, countVec2, on = 'user_id', how = 'right')

y = allfea.raw_label
y = y.replace(['a', 'b', 'c', 'd'], [1, 2, 2, 2]) 
allfea2 = allfea.iloc[:, 1::]
allfea3 = allfea2.drop(['raw_label'],axis = 1)
#allfea3 = allfea2.drop(['Unnamed: 0_y'],axis = 1)
allfea3 = allfea3.drop(['user_id'],axis = 1)
#RF Select feature
# feaImp = RFfeatureSel(allfea3,y)
feaImp = pd.read_csv(path + 'topFea.csv')
selectFea = selectFea(feaImp,200) 

In [11]:

def SVMPredictTest(X_train, y_train, X_test):

	smote_enn = SMOTEENN(random_state=42)
	cv = StratifiedKFold(n_splits=5, random_state = 0)
	svc = make_pipeline(smote_enn, StandardScaler(), svm.SVC())
	parameters = [{'svc__kernel': ['linear', 'poly', 'rbf', 'sigmoid'], 'svc__gamma': [0.01, 0.001, 0.0001],
	                     'svc__C':[0.1, 0.3, 0.5, 0.7, 0.9, 1.0, 1.5, 2.0, 10] , 'svc__class_weight':['balanced']}]
	                    

	grid_search_item = GridSearchCV(estimator = svc,
	                          param_grid = parameters,
	                           cv =  cv,
	                           scoring = 'accuracy',
	                           n_jobs = -1)
	grid_search = grid_search_item.fit(X_train, y_train)

	#y_pred = grid_search.predict(X_train)
	#print(classification_report(y_train, y_pred))

	print('Best scores and best parameters')
	print(grid_search.best_score_)
	print(grid_search.best_params_)

	y_pred = grid_search.predict(X_test)

	return y_pred
	

In [14]:
print('get test set')
featuresT = path + 'suicideDetection/TestFeatures/FreqSentiMotiTopiFea.csv'
liwcT = path + 'suicideDetection/TestFeatures/liwcSW.csv'
empathT = path + 'suicideDetection/TestFeatures/empathSW.csv'
testFea = mergeFea(featuresT, liwcT, empathT)

user_idFile = path + 'data/clpsych19_training_data/testSW.csv'
countVec = path + 'suicideDetection/TestFeatures/countVec2.csv'
countVec2 = getCountVect(user_idFile, countVec)
#add tfidf count vect as features


allfeaTest = pd.merge(testFea, countVec2, on = 'user_id', how = 'right')
selectedFeaTest = allfeaTest.loc[:, allfeaTest.columns.isin(selectFea)]
selectedFeaTest['user_id'] = allfeaTest.user_id
selectedFeaTest.to_csv(path + 'suicideDetection/TestFeatures/selectFeaTest.csv')

selectedFeaX = allfea2.loc[:, allfea2.columns.isin(selectedFeaTest.columns)]
selectedFeaX['user_id'] = allfea2.user_id
print('trainset and testset shape', selectedFeaX.shape, selectedFeaTest.shape)


print('prediction')
#prediction
result = SVMPredictTest(selectedFeaX, y, selectedFeaTest)
print(result)


get test set
trainset and testset shape (496, 181) (125, 181)
prediction


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  from ipykernel import kernelapp as app
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


Best scores and best parameters
0.8346774193548387
{'svc__C': 2.0, 'svc__class_weight': 'balanced', 'svc__gamma': 0.0001, 'svc__kernel': 'rbf'}
[2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
 2 2 2 2 2 2 2 2 2 2 2 2 2 2]


  Xt = transform.transform(Xt)
