In [1]:
%matplotlib inline
import pandas as pd
import numpy as np
import seaborn as sns
import sqldf
import matplotlib.pyplot as plt
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import  OneHotEncoder, StandardScaler, LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LogisticRegression, LinearRegression, LogisticRegressionCV, RidgeClassifierCV
from sklearn.metrics import accuracy_score, confusion_matrix, r2_score, classification_report, f1_score
from sklearn.feature_selection import f_regression 
from sklearn.model_selection import cross_val_score, cross_val_predict, KFold, train_test_split, GridSearchCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
import warnings
# Silence all warnings and print numpy arrays with four decimal places.
# NOTE(review): a blanket 'ignore' also hides deprecation warnings raised by
# pandas / scikit-learn; consider narrowing it while developing.
warnings.filterwarnings('ignore')
np.set_printoptions(precision=4)

In [2]:
# Display settings: never truncate columns or rows, and allow wide console output.
for option_name, option_value in (
    ('display.max_columns', None),
    ('display.max_rows', None),
    ('display.width', 1000),
):
    pd.set_option(option_name, option_value)

In [3]:
# Baseline experiment: raw transactions only, without the IP-to-country mapping.
# The path is a raw string so that the Windows backslashes (\D, \F) are never
# interpreted as escape sequences.
# NOTE(review): absolute local path; consider a configurable data directory.
df1 = pd.read_csv(r"D:\DataScience\DataSet\Fraud_Data.csv")
df1.describe(include='all')


Unnamed: 0,user_id,signup_time,purchase_time,purchase_value,device_id,source,browser,sex,age,ip_address,class
count,151112.0,151112,151112,151112.0,151112,151112,151112,151112,151112.0,151112.0,151112.0
unique,,151112,150679,,137956,3,5,2,,,
top,,2015-04-24 07:59:36,2015-09-10 09:04:53,,NGQCKIADMZORL,SEO,Chrome,M,,,
freq,,1,3,,20,60615,61432,88293,,,
mean,200171.04097,,,36.935372,,,,,33.140704,2152145000.0,0.093646
std,115369.285024,,,18.322762,,,,,8.617733,1248497000.0,0.291336
min,2.0,,,9.0,,,,,18.0,52093.5,0.0
25%,100642.5,,,22.0,,,,,27.0,1085934000.0,0.0
50%,199958.0,,,35.0,,,,,33.0,2154770000.0,0.0
75%,300054.0,,,49.0,,,,,39.0,3243258000.0,0.0


In [4]:
# Preview the first rows of the raw transactions frame.
df1.head()

Unnamed: 0,user_id,signup_time,purchase_time,purchase_value,device_id,source,browser,sex,age,ip_address,class
0,22058,2015-02-24 22:55:49,2015-04-18 02:47:11,34,QVPSPJUOCKZAR,SEO,Chrome,M,39,732758400.0,0
1,333320,2015-06-07 20:39:50,2015-06-08 01:38:54,16,EOGFQPIZPYXFZ,Ads,Chrome,F,53,350311400.0,0
2,1359,2015-01-01 18:52:44,2015-01-01 18:52:45,15,YSSKYOSJHPPLJ,SEO,Opera,M,53,2621474000.0,1
3,150084,2015-04-28 21:13:25,2015-05-04 13:54:50,44,ATGTXKYKUDUQN,SEO,Safari,M,41,3840542000.0,0
4,221365,2015-07-21 07:09:52,2015-09-09 18:40:53,39,NAUITBZFJKHWW,Ads,Safari,M,45,415583100.0,0


In [5]:
# Print the (rows, columns) dimensions of the raw frame.
print(df1.shape)

(151112, 11)


In [6]:
# Columns left out of the baseline model (identifiers, timestamps, raw IP).
good_bye_list = ['user_id','signup_time', 'purchase_time', 'device_id', 'ip_address']
df1 = df1.drop(columns=good_bye_list)
df1.head()

Unnamed: 0,purchase_value,source,browser,sex,age,class
0,34,SEO,Chrome,M,39,0
1,16,Ads,Chrome,F,53,0
2,15,SEO,Opera,M,53,1
3,44,SEO,Safari,M,41,0
4,39,Ads,Safari,M,45,0


In [7]:
# Encode sex as 0 (M) / 1 (F).
# The result is assigned back to the column: calling replace(..., inplace=True)
# on df1['sex'] is chained assignment with an inplace method, which does not
# update df1 under pandas copy-on-write. The explicit cast keeps the column an
# integer dtype so the dtype scan further down still treats it as numeric.
df1['sex'] = df1['sex'].replace({'M': 0, 'F': 1}).astype('int64')
df1.head()

Unnamed: 0,purchase_value,source,browser,sex,age,class
0,34,SEO,Chrome,0,39,0
1,16,Ads,Chrome,1,53,0
2,15,SEO,Opera,0,53,1
3,44,SEO,Safari,0,41,0
4,39,Ads,Safari,0,45,0


Variable cible : Y = 'class'

Variables explicatives : 'purchase_value', 'source', 'browser', 'sex', 'age'

Nombre d'exemples : 151112

Imputation de valeurs manquantes : aucune (toutes les colonnes comptent 151112 valeurs).
Si nécessaire : from sklearn.impute import SimpleImputer

Standardisation (moyenne 0, écart-type 1) des variables quantitatives : 'purchase_value', 'sex', 'age'

from sklearn.preprocessing import OneHotEncoder (variables qualitatives à plusieurs modalités, ex : country), StandardScaler, LabelEncoder (2 modalités, ex : yes, no)

Encodage des variables catégorielles en colonnes indicatrices 0/1 : 'browser', 'source'
from sklearn.preprocessing import OneHotEncoder (variables qualitatives à plusieurs modalités), StandardScaler, LabelEncoder


In [10]:
# Separate the explanatory variables (X) from the label column (y).
print("Separating labels from features...")
target_variable = 'class'
features_list = ['purchase_value', 'source', 'browser', 'sex', 'age']

X = df1[features_list]
y = df1[target_variable]

print("...Done.")
print()

# Quick look at both pieces.
print('y : ')
print(y.head())
print()
print('X :')
print(X.head())

Separating labels from features...
...Done.

y : 
0    0
1    0
2    1
3    0
4    0
Name: class, dtype: int64

X :
   purchase_value source browser  sex  age
0              34    SEO  Chrome    0   39
1              16    Ads  Chrome    1   53
2              15    SEO   Opera    0   53
3              44    SEO  Safari    0   41
4              39    Ads  Safari    0   45


In [11]:
# Sort the feature columns into numeric vs. categorical by dtype and record the
# position of each column: the ColumnTransformer is later applied to plain
# numpy arrays, so it needs positional indices rather than column names.
numeric_features = []
numeric_indices = []
categorical_features = []
categorical_indices = []
# Series.items() replaces Series.iteritems(), which was removed in pandas 2.0.
for idx, (column, dtype) in enumerate(X.dtypes.items()):
    dtype_name = str(dtype)
    if ('float' in dtype_name) or ('int' in dtype_name):
        numeric_features.append(column)
        numeric_indices.append(idx)
    else:
        categorical_features.append(column)
        categorical_indices.append(idx)

print('Found numeric features ', numeric_features,' at positions ', numeric_indices)
print('Found categorical features ', categorical_features,' at positions ', categorical_indices)

Found numeric features  ['purchase_value', 'sex', 'age']  at positions  [0, 3, 4]
Found categorical features  ['source', 'browser']  at positions  [1, 2]


In [12]:
# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=10
)

# Peek at every piece, then report the sizes.
print()
for part in (X_train, X_test, y_train, y_test):
    print(part.head())
print()
for feature_part in (X_train, X_test):
    print(feature_part.shape)
for label_part in (y_train, y_test):
    print(label_part.shape[0])


        purchase_value  source browser  sex  age
43147               36     Ads  Chrome    0   25
115663              45  Direct      IE    0   41
15503               90     SEO  Safari    1   27
70481               69  Direct  Chrome    1   30
126169              35     SEO  Chrome    1   35
        purchase_value  source browser  sex  age
26612               63     Ads      IE    1   25
90677               61     SEO      IE    0   45
139694              51     Ads  Safari    0   35
128232              50     Ads  Chrome    1   33
22815               39  Direct  Chrome    0   22
43147     0
115663    0
15503     0
70481     0
126169    0
Name: class, dtype: int64
26612     0
90677     0
139694    0
128232    0
22815     1
Name: class, dtype: int64

(120889, 5)
(30223, 5)
120889
30223


In [13]:
# Switch from pandas objects to numpy arrays (features) and plain lists (labels).
print("Convert pandas DataFrames to numpy arrays...")
X_train, X_test = X_train.values, X_test.values
y_train, y_test = y_train.tolist(), y_test.tolist()
print("...Done")

# Show a few rows / labels of each split.
print(X_train[:5])
print(X_test[:2])
print()
print(y_train[:5])
print(y_test[:2])

Convert pandas DataFrames to numpy arrays...
...Done
[[36 'Ads' 'Chrome' 0 25]
 [45 'Direct' 'IE' 0 41]
 [90 'SEO' 'Safari' 1 27]
 [69 'Direct' 'Chrome' 1 30]
 [35 'SEO' 'Chrome' 1 35]]
[[63 'Ads' 'IE' 1 25]
 [61 'SEO' 'IE' 0 45]]

[0, 0, 0, 0, 0]
[0, 0]


In [14]:
print("Encoding categorical features and standardizing numerical features...")
print()
print(X_train[:5])
print(X_test[:5])

# Numeric columns: StandardScaler (standardization, not 0-1 scaling).
numeric_transformer = StandardScaler()

# Categorical columns: one-hot encoding into 0/1 indicator columns,
# dropping the first level of each variable.
categorical_transformer = OneHotEncoder(drop='first')

# Route each group of columns (selected by position) to its transformer.
featureencoder = ColumnTransformer(
    transformers=[
        ('cat', categorical_transformer, categorical_indices),
        ('num', numeric_transformer, numeric_indices),
    ]
)

# Fit on the training split only, then reuse the fitted encoder on the test split.
X_train = featureencoder.fit_transform(X_train)
X_test = featureencoder.transform(X_test)

print("...Done")
print(X_train[:5])
print()
print(X_test[:5])

Encoding categorical features and standardizing numerical features...

[[36 'Ads' 'Chrome' 0 25]
 [45 'Direct' 'IE' 0 41]
 [90 'SEO' 'Safari' 1 27]
 [69 'Direct' 'Chrome' 1 30]
 [35 'SEO' 'Chrome' 1 35]]
[[63 'Ads' 'IE' 1 25]
 [61 'SEO' 'IE' 0 45]
 [51 'Ads' 'Safari' 0 35]
 [50 'Ads' 'Chrome' 1 33]
 [39 'Direct' 'Chrome' 0 22]]
...Done
[[ 0.      0.      0.      0.      0.      0.     -0.0514 -0.8452 -0.9465]
 [ 1.      0.      0.      1.      0.      0.      0.44   -0.8452  0.9125]
 [ 0.      1.      0.      0.      0.      1.      2.8967  1.1832 -0.7142]
 [ 1.      0.      0.      0.      0.      0.      1.7502  1.1832 -0.3656]
 [ 0.      1.      0.      0.      0.      0.     -0.106   1.1832  0.2153]]

[[ 0.      0.      0.      1.      0.      0.      1.4226  1.1832 -0.9465]
 [ 0.      1.      0.      1.      0.      0.      1.3135 -0.8452  1.3772]
 [ 0.      0.      0.      0.      0.      1.      0.7675 -0.8452  0.2153]
 [ 0.      0.      0.      0.      0.      0.      0.7129  1

In [15]:
# Baseline model: logistic regression with scikit-learn's default settings.
# NOTE(review): the printed output shows F1 = 0 on both splits, i.e. the model
# never predicts class 1; accuracy alone is misleading on this imbalanced target.
classifier = LogisticRegression()
classifier.fit(X_train, y_train)

# Accuracy on both splits.
train_accuracy = classifier.score(X_train, y_train)
test_accuracy = classifier.score(X_test, y_test)
print('Score on training set : ', train_accuracy)
print('Score on testing set : ', test_accuracy)

# Hard predictions, kept for the confusion matrices printed afterwards.
y_train_pred_classifier = classifier.predict(X_train)
y_test_pred_classifier = classifier.predict(X_test)
print()
print("f1_score on train set : ", f1_score(y_train, y_train_pred_classifier))
print("f1_score on test set : ", f1_score(y_test, y_test_pred_classifier))
print()

Score on training set :  0.9067905268469423
Score on testing set :  0.9046090725606326

f1_score on train set :  0.0
f1_score on test set :  0.0



In [16]:
# Confusion matrices of the logistic-regression predictions, one per split.
for heading, truth, predicted in (
    ("Confusion matrix on train set : ", y_train, y_train_pred_classifier),
    ("Confusion matrix on test set : ", y_test, y_test_pred_classifier),
):
    print(heading)
    print(confusion_matrix(truth, predicted))
    print()


Confusion matrix on train set : 
[[109621      0]
 [ 11268      0]]

Confusion matrix on test set : 
[[27340     0]
 [ 2883     0]]



In [17]:
# Gaussian naive Bayes with default settings, for comparison with logistic regression.
# NOTE(review): mid-notebook import; it belongs with the other imports in the first cell.
import sklearn.naive_bayes as nb
naivebayes = nb.GaussianNB()
naivebayes.fit(X_train, y_train)

# Accuracy on both splits.
nb_train_accuracy = naivebayes.score(X_train, y_train)
nb_test_accuracy = naivebayes.score(X_test, y_test)
print('Score on training set : ', nb_train_accuracy)
print('Score on testing set : ', nb_test_accuracy)

# Hard predictions and their F1 scores.
y_train_pred_naivebayes = naivebayes.predict(X_train)
y_test_pred_naivebayes = naivebayes.predict(X_test)
print()
print("f1_score on train set : ", f1_score(y_train, y_train_pred_naivebayes))
print("f1_score on test set : ", f1_score(y_test, y_test_pred_naivebayes))
print()

Score on training set :  0.9067905268469423
Score on testing set :  0.9046090725606326

f1_score on train set :  0.0
f1_score on test set :  0.0



In [18]:
# Confusion matrices for the Gaussian naive Bayes model fitted in the previous
# cell. This cell previously printed the logistic-regression predictions
# (y_*_pred_classifier) again instead of the naive Bayes ones.
print("Confusion matrix on train set : ")
print(confusion_matrix(y_train, y_train_pred_naivebayes))
print()
print("Confusion matrix on test set : ")
print(confusion_matrix(y_test, y_test_pred_naivebayes))
print()

Confusion matrix on train set : 
[[109621      0]
 [ 11268      0]]

Confusion matrix on test set : 
[[27340     0]
 [ 2883     0]]



In [19]:
# Logistic regression with built-in cross-validation (30 folds, all CPU cores).
classifier = LogisticRegressionCV(
    cv=30,
    verbose=3,
    random_state=0,
    n_jobs=-1,
    max_iter=400,
)
classifier.fit(X_train, y_train)

# Accuracy on both splits.
cv_train_accuracy = classifier.score(X_train, y_train)
cv_test_accuracy = classifier.score(X_test, y_test)
print('Score on training set : ', cv_train_accuracy)
print('Score on testing set : ', cv_test_accuracy)

# Hard predictions (these overwrite the ones from the plain logistic regression).
y_train_pred_classifier = classifier.predict(X_train)
y_test_pred_classifier = classifier.predict(X_test)
print()
print("f1_score on train set : ", f1_score(y_train, y_train_pred_classifier))
print("f1_score on test set : ", f1_score(y_test, y_test_pred_classifier))
print()

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  26 out of  30 | elapsed:   12.4s remaining:    1.8s
[Parallel(n_jobs=-1)]: Done  30 out of  30 | elapsed:   12.7s finished


Score on training set :  0.9067905268469423
Score on testing set :  0.9046090725606326

f1_score on train set :  0.0
f1_score on test set :  0.0



In [79]:
# Second experiment: transactions enriched with the IP-to-country mapping.
# Raw strings keep the Windows backslashes (\D, \F, \I) from being read as
# escape sequences.
# NOTE(review): absolute local paths; consider a configurable data directory.
df1 = pd.read_csv(r"D:\DataScience\DataSet\Fraud_Data.csv")
df2 = pd.read_csv(r"D:\DataScience\DataSet\IpAddress_to_Country.csv")

In [81]:
def ip_to_country(ip=None, ip_table=None):
    """Return the country whose IP range contains ``ip``.

    Parameters
    ----------
    ip : number, optional
        Numeric IP address to look up. ``None`` / NaN is reported as "Unknown".
    ip_table : pandas.DataFrame, optional
        Lookup table with ``lower_bound_ip_address``, ``upper_bound_ip_address``
        and ``country`` columns. Defaults to the notebook-level ``df2``.

    Returns
    -------
    The ``country`` value of the first row whose bounds include ``ip``
    (both bounds inclusive), or the string "Unknown" when no row matches.
    """
    # A missing address cannot fall in any range; skip the table scan.
    if ip is None or pd.isna(ip):
        return "Unknown"
    if ip_table is None:
        ip_table = df2
    in_range = (ip_table.lower_bound_ip_address <= ip) & (ip_table.upper_bound_ip_address >= ip)
    matches = ip_table.country[in_range]
    # Explicit emptiness check instead of catching IndexError from .iloc[0].
    if matches.empty:
        return "Unknown"
    return matches.iloc[0]

# Assign a country to each IP address:
# NOTE(review): ip_to_country is called once per row and each call scans the
# whole lookup table, so this is the slow step of the notebook.
df1["ip_country"] = df1["ip_address"].apply(ip_to_country)

In [82]:
# Persist the enriched frame so the slow country lookup need not be repeated.
# Raw string: the backslashes in the Windows path are not escape sequences.
df1.to_csv(r"D:\DataScience\DataSet\Fraud_Data_clean.csv", index=False)

In [95]:
# Reload the enriched transactions (with the ip_country column) from disk.
# Raw string: the backslashes in the Windows path are not escape sequences.
df = pd.read_csv(r"D:\DataScience\DataSet\Fraud_Data_clean.csv")
df.head()

Unnamed: 0,user_id,signup_time,purchase_time,purchase_value,device_id,source,browser,sex,age,ip_address,class,ip_country
0,22058,2015-02-24 22:55:49,2015-04-18 02:47:11,34,QVPSPJUOCKZAR,SEO,Chrome,M,39,732758400.0,0,Japan
1,333320,2015-06-07 20:39:50,2015-06-08 01:38:54,16,EOGFQPIZPYXFZ,Ads,Chrome,F,53,350311400.0,0,United States
2,1359,2015-01-01 18:52:44,2015-01-01 18:52:45,15,YSSKYOSJHPPLJ,SEO,Opera,M,53,2621474000.0,1,United States
3,150084,2015-04-28 21:13:25,2015-05-04 13:54:50,44,ATGTXKYKUDUQN,SEO,Safari,M,41,3840542000.0,0,Unknown
4,221365,2015-07-21 07:09:52,2015-09-09 18:40:53,39,NAUITBZFJKHWW,Ads,Safari,M,45,415583100.0,0,United States


In [213]:
#good_bye_list = ['Unnamed: 0']
#df1.drop(good_bye_list, axis=1, inplace=True)

In [84]:
# List the column names of the reloaded frame.
df.columns

Index(['user_id', 'signup_time', 'purchase_time', 'purchase_value', 'device_id', 'source', 'browser', 'sex', 'age', 'ip_address', 'class', 'ip_country'], dtype='object')

In [96]:
def clean_up_data_frame(frame=None):
    '''
    Add a ``time_to_purchase`` column to the data frame, in place.

    The column holds the whole number of days between ``signup_time`` and
    ``purchase_time`` (both parsed as '%Y-%m-%d %H:%M:%S' strings).
    No column is removed here.

    Parameters
    ----------
    frame : pandas.DataFrame, optional
        Frame to modify. Defaults to the notebook-level ``df``.

    Returns
    -------
    None
    '''
    if frame is None:
        frame = df
    timestamp_format = '%Y-%m-%d %H:%M:%S'
    # Vectorized parsing: no per-row strptime and no reliance on `datetime`
    # having been imported by a later cell.
    purchase = pd.to_datetime(frame['purchase_time'], format=timestamp_format)
    signup = pd.to_datetime(frame['signup_time'], format=timestamp_format)
    # .dt.days keeps only the days component of each time difference.
    frame['time_to_purchase'] = (purchase - signup).dt.days
    

In [97]:
# NOTE(review): mid-notebook import; it belongs with the other imports in the first cell.
from datetime import datetime
# Build the time_to_purchase feature on df.
clean_up_data_frame()

In [87]:
# Preview the frame after the time_to_purchase column has been added.
df.head()

Unnamed: 0,user_id,signup_time,purchase_time,purchase_value,device_id,source,browser,sex,age,ip_address,class,ip_country,time_to_purchase
0,22058,2015-02-24 22:55:49,2015-04-18 02:47:11,34,QVPSPJUOCKZAR,SEO,Chrome,M,39,732758400.0,0,Japan,52
1,333320,2015-06-07 20:39:50,2015-06-08 01:38:54,16,EOGFQPIZPYXFZ,Ads,Chrome,F,53,350311400.0,0,United States,0
2,1359,2015-01-01 18:52:44,2015-01-01 18:52:45,15,YSSKYOSJHPPLJ,SEO,Opera,M,53,2621474000.0,1,United States,0
3,150084,2015-04-28 21:13:25,2015-05-04 13:54:50,44,ATGTXKYKUDUQN,SEO,Safari,M,41,3840542000.0,0,Unknown,5
4,221365,2015-07-21 07:09:52,2015-09-09 18:40:53,39,NAUITBZFJKHWW,Ads,Safari,M,45,415583100.0,0,United States,50


In [98]:
# Encode sex as 0 (M) / 1 (F).
# The result is assigned back to the column: calling replace(..., inplace=True)
# on df['sex'] is chained assignment with an inplace method, which does not
# update df under pandas copy-on-write. The explicit cast keeps the column an
# integer dtype so the dtype scan further down still treats it as numeric.
df['sex'] = df['sex'].replace({'M': 0, 'F': 1}).astype('int64')
df.head()

Unnamed: 0,user_id,signup_time,purchase_time,purchase_value,device_id,source,browser,sex,age,ip_address,class,ip_country,time_to_purchase
0,22058,2015-02-24 22:55:49,2015-04-18 02:47:11,34,QVPSPJUOCKZAR,SEO,Chrome,0,39,732758400.0,0,Japan,52
1,333320,2015-06-07 20:39:50,2015-06-08 01:38:54,16,EOGFQPIZPYXFZ,Ads,Chrome,1,53,350311400.0,0,United States,0
2,1359,2015-01-01 18:52:44,2015-01-01 18:52:45,15,YSSKYOSJHPPLJ,SEO,Opera,0,53,2621474000.0,1,United States,0
3,150084,2015-04-28 21:13:25,2015-05-04 13:54:50,44,ATGTXKYKUDUQN,SEO,Safari,0,41,3840542000.0,0,Unknown,5
4,221365,2015-07-21 07:09:52,2015-09-09 18:40:53,39,NAUITBZFJKHWW,Ads,Safari,0,45,415583100.0,0,United States,50


In [93]:
# Columns left out of the second model (identifiers, timestamps, raw IP, country).
# NOTE(review): ip_country is dropped here, so the country lookup computed
# earlier is not used by the model that follows.
good_bye_list = ['user_id', 'signup_time', 'purchase_time', 'device_id', 'ip_address', 'ip_country']
df = df.drop(columns=good_bye_list)
df.head()

Unnamed: 0,purchase_value,source,browser,sex,age,time_to_purchase
0,34,SEO,Chrome,0,39,52
1,16,Ads,Chrome,1,53,0
2,15,SEO,Opera,0,53,0
3,44,SEO,Safari,0,41,5
4,39,Ads,Safari,0,45,50


Variable cible : Y = 'class'

Variables explicatives : 'purchase_value', 'source', 'browser', 'sex', 'age', 'time_to_purchase'

Nombre d'exemples : 151112

Imputation de valeurs manquantes : aucune.
Si nécessaire : from sklearn.impute import SimpleImputer

Standardisation (moyenne 0, écart-type 1) des variables quantitatives : 'purchase_value', 'sex', 'age', 'time_to_purchase'

from sklearn.preprocessing import OneHotEncoder (variables qualitatives à plusieurs modalités, ex : country), StandardScaler, LabelEncoder (2 modalités, ex : yes, no)

Encodage des variables catégorielles en colonnes indicatrices 0/1 : 'browser', 'source'
from sklearn.preprocessing import OneHotEncoder (variables qualitatives à plusieurs modalités), StandardScaler, LabelEncoder

In [99]:
# Separate the explanatory variables (X, now including time_to_purchase)
# from the label column (y).
print("Separating labels from features...")
target_variable = 'class'
features_list = ['purchase_value', 'source', 'browser', 'sex', 'age', 'time_to_purchase']

X = df[features_list]
y = df[target_variable]

print("...Done.")
print()

# Quick look at both pieces.
print('y : ')
print(y.head())
print()
print('X :')
print(X.head())

Separating labels from features...
...Done.

y : 
0    0
1    0
2    1
3    0
4    0
Name: class, dtype: int64

X :
   purchase_value source browser  sex  age  time_to_purchase
0              34    SEO  Chrome    0   39                52
1              16    Ads  Chrome    1   53                 0
2              15    SEO   Opera    0   53                 0
3              44    SEO  Safari    0   41                 5
4              39    Ads  Safari    0   45                50


In [100]:
# Sort the feature columns into numeric vs. categorical by dtype and record the
# position of each column: the ColumnTransformer is later applied to plain
# numpy arrays, so it needs positional indices rather than column names.
numeric_features = []
numeric_indices = []
categorical_features = []
categorical_indices = []
# Series.items() replaces Series.iteritems(), which was removed in pandas 2.0.
for idx, (column, dtype) in enumerate(X.dtypes.items()):
    dtype_name = str(dtype)
    if ('float' in dtype_name) or ('int' in dtype_name):
        numeric_features.append(column)
        numeric_indices.append(idx)
    else:
        categorical_features.append(column)
        categorical_indices.append(idx)

print('Found numeric features ', numeric_features,' at positions ', numeric_indices)
print('Found categorical features ', categorical_features,' at positions ', categorical_indices)

Found numeric features  ['purchase_value', 'sex', 'age', 'time_to_purchase']  at positions  [0, 3, 4, 5]
Found categorical features  ['source', 'browser']  at positions  [1, 2]


In [101]:
# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=10
)

# Peek at every piece, then report the sizes.
print()
for part in (X_train, X_test, y_train, y_test):
    print(part.head())
print()
for feature_part in (X_train, X_test):
    print(feature_part.shape)
for label_part in (y_train, y_test):
    print(label_part.shape[0])


        purchase_value  source browser  sex  age  time_to_purchase
43147               36     Ads  Chrome    0   25               108
115663              45  Direct      IE    0   41                13
15503               90     SEO  Safari    1   27                27
70481               69  Direct  Chrome    1   30                61
126169              35     SEO  Chrome    1   35                 9
        purchase_value  source browser  sex  age  time_to_purchase
26612               63     Ads      IE    1   25                22
90677               61     SEO      IE    0   45                56
139694              51     Ads  Safari    0   35                93
128232              50     Ads  Chrome    1   33                34
22815               39  Direct  Chrome    0   22                 0
43147     0
115663    0
15503     0
70481     0
126169    0
Name: class, dtype: int64
26612     0
90677     0
139694    0
128232    0
22815     1
Name: class, dtype: int64

(120889, 6)
(30223, 6)

In [102]:
# Switch from pandas objects to numpy arrays (features) and plain lists (labels).
print("Convert pandas DataFrames to numpy arrays...")
X_train, X_test = X_train.values, X_test.values
y_train, y_test = y_train.tolist(), y_test.tolist()
print("...Done")

# Show a few rows / labels of each split.
print(X_train[:5])
print(X_test[:2])
print()
print(y_train[:5])
print(y_test[:2])

Convert pandas DataFrames to numpy arrays...
...Done
[[36 'Ads' 'Chrome' 0 25 108]
 [45 'Direct' 'IE' 0 41 13]
 [90 'SEO' 'Safari' 1 27 27]
 [69 'Direct' 'Chrome' 1 30 61]
 [35 'SEO' 'Chrome' 1 35 9]]
[[63 'Ads' 'IE' 1 25 22]
 [61 'SEO' 'IE' 0 45 56]]

[0, 0, 0, 0, 0]
[0, 0]


In [130]:
print("Encoding categorical features and standardizing numerical features...")
print()
print(X_train[:5])
print(X_test[:5])

# Numeric columns: StandardScaler (standardization, not 0-1 scaling).
numeric_transformer = StandardScaler()

# Categorical columns: one-hot encoding into 0/1 indicator columns,
# dropping the first level of each variable.
categorical_transformer = OneHotEncoder(drop='first')

# Route each group of columns (selected by position) to its transformer.
featureencoder = ColumnTransformer(
    transformers=[
        ('cat', categorical_transformer, categorical_indices),
        ('num', numeric_transformer, numeric_indices),
    ]
)

# Fit on the training split only, then reuse the fitted encoder on the test split.
X_train = featureencoder.fit_transform(X_train)
X_test = featureencoder.transform(X_test)

print("...Done")
print(X_train[:5])
print()
print(X_test[:5])

Encoding categorical features and standardizing numerical features...

[[ 0.      0.      0.      0.      0.      0.     -0.0514 -0.8452 -0.9465]
 [ 1.      0.      0.      1.      0.      0.      0.44   -0.8452  0.9125]
 [ 0.      1.      0.      0.      0.      1.      2.8967  1.1832 -0.7142]
 [ 1.      0.      0.      0.      0.      0.      1.7502  1.1832 -0.3656]
 [ 0.      1.      0.      0.      0.      0.     -0.106   1.1832  0.2153]]
[[ 0.      0.      0.      1.      0.      0.      1.4226  1.1832 -0.9465]
 [ 0.      1.      0.      1.      0.      0.      1.3135 -0.8452  1.3772]
 [ 0.      0.      0.      0.      0.      1.      0.7675 -0.8452  0.2153]
 [ 0.      0.      0.      0.      0.      0.      0.7129  1.1832 -0.017 ]
 [ 1.      0.      0.      0.      0.      0.      0.1124 -0.8452 -1.2951]]
...Done
[[ 0.      0.     -0.5053 -0.5664 -0.1575]
 [ 0.      0.      1.9791  1.7655 -0.1575]
 [ 1.      0.     -0.5053 -0.5664 -0.1575]
 [ 0.      0.      1.9791 -0.5664 -0.157

In [131]:
# Logistic regression with scikit-learn's default settings on the current features.
classifier = LogisticRegression()
classifier.fit(X_train, y_train)

# Accuracy on both splits.
train_accuracy = classifier.score(X_train, y_train)
test_accuracy = classifier.score(X_test, y_test)
print('Score on training set : ', train_accuracy)
print('Score on testing set : ', test_accuracy)

# Hard predictions and their F1 scores.
y_train_pred_classifier = classifier.predict(X_train)
y_test_pred_classifier = classifier.predict(X_test)
print()
print("f1_score on train set : ", f1_score(y_train, y_train_pred_classifier))
print("f1_score on test set : ", f1_score(y_test, y_test_pred_classifier))
print()

Score on training set :  0.9067905268469423
Score on testing set :  0.9046090725606326

f1_score on train set :  0.0
f1_score on test set :  0.0



In [122]:
# Recompute the hard predictions from the current classifier and score them with F1.
y_train_pred_classifier, y_test_pred_classifier = (
    classifier.predict(X_train),
    classifier.predict(X_test),
)
print()
print("f1_score on train set : ", f1_score(y_train, y_train_pred_classifier))
print("f1_score on test set : ", f1_score(y_test, y_test_pred_classifier))
print()


f1_score on train set :  0.0
f1_score on test set :  0.0



In [129]:
# Inspect the first 1000 training predictions.
# NOTE(review): this dumps a long array into the output; a count per predicted
# class would answer the same question more compactly.
y_train_pred_classifier[:1000]

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

In [125]:
# Inspect the first 1000 training predictions.
# NOTE(review): same expression as the preceding cell; candidate for removal.
y_train_pred_classifier[:1000]

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,