In [1]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import log_loss
from sklearn.datasets import load_iris
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
#from sklearn.metrics import plot_confusion_matrix

from imblearn.over_sampling import RandomOverSampler, SMOTE
from imblearn.under_sampling import RandomUnderSampler

  from numpy.core.umath_tests import inner1d


# Data Cleaning

In [2]:
# Load the raw survey export; every later cell derives its frames from `data`.
data = pd.read_csv("avg_survey_data.csv")
# Preview the first five responses.
data.head(5)

Unnamed: 0,Timestamp,"Before we begin, what's your age (in years)?",Please assign a number 0-10 for each priority. [Your commitment to your relationship with your partner],Please assign a number 0-10 for each priority. [Your physical distance to your partner ],Please assign a number 0-10 for each priority. [Your career],Please assign a number 0-10 for each priority. [Your partner’s career],Would you...,Please assign a number 0-10 for each priority. [Your commitment to your relationship with your partner].1,Please assign a number 0-10 for each priority. [Your physical proximity to your partner],Please assign a number 0-10 for each priority. [Your career].2,...,Please assign a number 0-10 for each priority. [Your physical proximity to your partner].1,Please assign a number 0-10 for each priority. [Your career].1,Please assign a number 0-10 for each priority. [Your partner’s career].1,Please assign a number 0-10 for each priority. [Communication/contact with your partner].1,Please assign a number 0-10 for each priority. [Your time].1,Please assign a number 0-10 for each priority. [Your partner's time].1,Please assign a number 0-10 for each priority. [Your sense of security in your relationship].1,Please assign a number 0-10 for each priority. [Your family's/friends' satisfaction ],Please assign a number 0-10 for each priority. [Your personal desire to get married],Would you....2
0,3/17/2021 1:47:01,18,3.0,1.0,6.0,1.0,Continue to commit to the relationship,5.0,3.0,5.0,...,4,7.0,5,7.0,7.0,5.0,6.0,7.0,7.0,Suggest the idea of marriage to your partner a...
1,3/21/2021 21:36:07,14,7.0,1.0,10.0,8.0,Continue to commit to the relationship,8.0,1.0,10.0,...,3,10.0,8,8.0,7.0,7.0,8.0,1.0,1.0,Not bring up the idea of marriage to your part...
2,3/21/2021 21:45:46,13,8.0,6.0,8.0,8.0,Continue to commit to the relationship,9.0,6.0,9.0,...,6,8.0,8,9.0,8.0,8.0,9.0,6.0,7.0,Subtly hint at the idea of marriage as a next ...
3,3/21/2021 21:47:26,13,10.0,6.0,10.0,10.0,Continue to commit to the relationship,10.0,5.0,10.0,...,5,10.0,10,6.0,9.0,9.0,9.0,3.0,10.0,Suggest the idea of marriage to your partner a...
4,3/21/2021 22:04:23,13,10.0,3.0,7.0,8.0,Continue to commit to the relationship,10.0,3.0,7.0,...,3,7.0,8,6.0,7.0,8.0,10.0,5.0,6.0,Suggest the idea of marriage to your partner a...


In [3]:
# Drop the submission timestamp, then replace the long survey-question headers
# with short names. The mapping is positional: 1 age column, 4 priorities +
# decision for scenario 1, 9 priorities + decision for scenario 2, and
# 10 priorities + decision for scenario 3.
data = data.drop(columns=['Timestamp'])
columns = data.columns
short_names = [
    'age',
    's1_commitment', 's1_distance', 's1_career', 's1_partner_career', 's1_decision',
    's2_commitment', 's2_distance', 's2_career', 's2_partner_career', 's2_communication',
    's2_your_time', 's2_partner_time', 's2_security', 's2_compromise', 's2_decision',
    's3_commitment', 's3_distance', 's3_career', 's3_partner_career', 's3_communication',
    's3_your_time', 's3_partner_time', 's3_security', 's3_family', 's3_marriage', 's3_decision',
]
data = data.rename(
    columns={columns[position]: name for position, name in enumerate(short_names)}
)
data.head()

Unnamed: 0,age,s1_commitment,s1_distance,s1_career,s1_partner_career,s1_decision,s2_commitment,s2_distance,s2_career,s2_partner_career,...,s3_distance,s3_career,s3_partner_career,s3_communication,s3_your_time,s3_partner_time,s3_security,s3_family,s3_marriage,s3_decision
0,18,3.0,1.0,6.0,1.0,Continue to commit to the relationship,5.0,3.0,5.0,3.0,...,4,7.0,5,7.0,7.0,5.0,6.0,7.0,7.0,Suggest the idea of marriage to your partner a...
1,14,7.0,1.0,10.0,8.0,Continue to commit to the relationship,8.0,1.0,10.0,8.0,...,3,10.0,8,8.0,7.0,7.0,8.0,1.0,1.0,Not bring up the idea of marriage to your part...
2,13,8.0,6.0,8.0,8.0,Continue to commit to the relationship,9.0,6.0,9.0,9.0,...,6,8.0,8,9.0,8.0,8.0,9.0,6.0,7.0,Subtly hint at the idea of marriage as a next ...
3,13,10.0,6.0,10.0,10.0,Continue to commit to the relationship,10.0,5.0,10.0,10.0,...,5,10.0,10,6.0,9.0,9.0,9.0,3.0,10.0,Suggest the idea of marriage to your partner a...
4,13,10.0,3.0,7.0,8.0,Continue to commit to the relationship,10.0,3.0,7.0,8.0,...,3,7.0,8,6.0,7.0,8.0,10.0,5.0,6.0,Suggest the idea of marriage to your partner a...


In [4]:
# Encode each scenario's answer text as a numeric class label.
# Outer keys are column names; inner dicts map answer text -> class id.
decision_codes = {
    's1_decision': {
        'Continue to commit to the relationship': 1,
        'Suggest to break-up with your partner': 0,
    },
    's2_decision': {
        'Accept their proposal to temporarily put a hiatus on communication within your relationship': 0,
        'Decline their proposal and suggest communicating a lot less than before (perhaps once a week or month) rather than eliminating communication completely': 1,
        'Decline their proposal & suggest to break up with your partner': 2,
    },
    's3_decision': {
        'Suggest the idea of marriage to your partner and potentially propose/be proposed to since you are both happy and stable in the relationship': 0,
        'Not bring up the idea of marriage to your partner and wait for them to bring it up one day, sooner or later, instead; dismiss your family and friends’ suggestions for the time-being': 1,
        'Subtly hint at the idea of marriage as a next step but not directly address this with your partner since you are both happy and stable in the relationship': 2,
        'Propose to break up with your partner and move on': 3,
    },
}
data = data.replace(decision_codes)
data.head()

Unnamed: 0,age,s1_commitment,s1_distance,s1_career,s1_partner_career,s1_decision,s2_commitment,s2_distance,s2_career,s2_partner_career,...,s3_distance,s3_career,s3_partner_career,s3_communication,s3_your_time,s3_partner_time,s3_security,s3_family,s3_marriage,s3_decision
0,18,3.0,1.0,6.0,1.0,1,5.0,3.0,5.0,3.0,...,4,7.0,5,7.0,7.0,5.0,6.0,7.0,7.0,0
1,14,7.0,1.0,10.0,8.0,1,8.0,1.0,10.0,8.0,...,3,10.0,8,8.0,7.0,7.0,8.0,1.0,1.0,1
2,13,8.0,6.0,8.0,8.0,1,9.0,6.0,9.0,9.0,...,6,8.0,8,9.0,8.0,8.0,9.0,6.0,7.0,2
3,13,10.0,6.0,10.0,10.0,1,10.0,5.0,10.0,10.0,...,5,10.0,10,6.0,9.0,9.0,9.0,3.0,10.0,0
4,13,10.0,3.0,7.0,8.0,1,10.0,3.0,7.0,8.0,...,3,7.0,8,6.0,7.0,8.0,10.0,5.0,6.0,0


In [5]:
# Cast every column (priorities, age and the encoded decisions) to float64.
data = data.astype('float64')

# Scenario 1

In [6]:
# Scenario 1 uses the first six columns: age, four priorities and the decision.
data_1 = data.iloc[:, :6]
data_1.head(n=10)

Unnamed: 0,age,s1_commitment,s1_distance,s1_career,s1_partner_career,s1_decision
0,18.0,3.0,1.0,6.0,1.0,1.0
1,14.0,7.0,1.0,10.0,8.0,1.0
2,13.0,8.0,6.0,8.0,8.0,1.0
3,13.0,10.0,6.0,10.0,10.0,1.0
4,13.0,10.0,3.0,7.0,8.0,1.0
5,14.0,4.0,4.0,3.0,4.0,1.0
6,14.0,7.0,5.0,6.0,7.0,1.0
7,13.0,10.0,6.0,9.0,10.0,1.0
8,14.0,3.0,3.0,3.0,2.0,1.0
9,14.0,10.0,10.0,10.0,10.0,1.0


# Scenario 2

In [7]:
# Scenario 2 frame: age, the nine scenario-2 priorities, then the decision label last.
data_2 = data.loc[:, [
    'age',
    's2_commitment', 's2_distance', 's2_career', 's2_partner_career', 's2_communication',
    's2_your_time', 's2_partner_time', 's2_security', 's2_compromise',
    's2_decision',
]]
data_2.head(n=10)

Unnamed: 0,age,s2_commitment,s2_distance,s2_career,s2_partner_career,s2_communication,s2_your_time,s2_partner_time,s2_security,s2_compromise,s2_decision
0,18.0,5.0,3.0,5.0,3.0,5.0,6.0,2.5,4.0,4.0,0.0
1,14.0,8.0,1.0,10.0,8.0,9.0,8.0,8.0,8.0,8.0,1.0
2,13.0,9.0,6.0,9.0,9.0,6.5,8.0,8.0,9.0,8.0,0.0
3,13.0,10.0,5.0,10.0,10.0,8.0,9.0,9.0,8.0,9.0,1.0
4,13.0,10.0,3.0,7.0,8.0,6.0,7.0,8.0,10.0,10.0,0.0
5,14.0,3.0,3.0,2.0,3.0,3.0,3.0,3.0,2.0,3.0,0.0
6,14.0,7.0,5.0,6.0,6.0,9.0,7.0,7.0,6.0,5.0,2.0
7,13.0,9.5,5.0,10.0,10.0,8.0,10.0,10.0,10.0,10.0,0.0
8,14.0,2.0,2.0,2.0,1.0,2.0,2.0,2.0,2.0,2.0,1.0
9,14.0,10.0,9.0,10.0,10.0,10.0,9.0,10.0,10.0,10.0,1.0


# Scenario 3

In [8]:
# Scenario 3 frame: age, the ten scenario-3 priorities, then the decision label last.
data_3 = data.loc[:, [
    'age',
    's3_commitment', 's3_distance', 's3_career', 's3_partner_career', 's3_communication',
    's3_your_time', 's3_partner_time', 's3_security', 's3_family', 's3_marriage',
    's3_decision',
]]
data_3.head(n=5)

Unnamed: 0,age,s3_commitment,s3_distance,s3_career,s3_partner_career,s3_communication,s3_your_time,s3_partner_time,s3_security,s3_family,s3_marriage,s3_decision
0,18.0,6.0,4.0,7.0,5.0,7.0,7.0,5.0,6.0,7.0,7.0,0.0
1,14.0,8.0,3.0,10.0,8.0,8.0,7.0,7.0,8.0,1.0,1.0,1.0
2,13.0,8.0,6.0,8.0,8.0,9.0,8.0,8.0,9.0,6.0,7.0,2.0
3,13.0,10.0,5.0,10.0,10.0,6.0,9.0,9.0,9.0,3.0,10.0,0.0
4,13.0,10.0,3.0,7.0,8.0,6.0,7.0,8.0,10.0,5.0,6.0,0.0


In [9]:
def normalize(data):
    """Rescale each row's priority scores so that they sum to 100.

    The first column and the last column are left untouched (in this notebook
    they hold the respondent's age and the decision label); every column in
    between is treated as a priority score and replaced by its percentage
    share of that row's total.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame laid out as ``[first, score_1, ..., score_k, last]``.

    Returns
    -------
    pandas.DataFrame
        A copy of ``data`` with the middle columns normalized; the input
        frame is not modified. A row whose scores sum to zero yields NaN
        for that row's scores.
    """
    data_copy = data.copy()
    # Select the score columns by *position*. The previous implementation
    # passed an integer slice to label-based `.loc`, which raises TypeError
    # on current pandas when the column labels are strings.
    scores = data_copy.iloc[:, 1:-1].values.astype(float)
    # One total per row, kept as a column vector so the division broadcasts row-wise.
    row_totals = scores.sum(axis=1, keepdims=True)
    data_copy.iloc[:, 1:-1] = (scores / row_totals) * 100
    return data_copy

In [10]:
# Normalize each scenario's priorities to percentages, rounded to 2 decimal places.
norm_data1, norm_data2, norm_data3 = (
    normalize(scenario_frame).round(decimals=2)
    for scenario_frame in (data_1, data_2, data_3)
)

# Train / Test Split

In [11]:
#scenario 1
# Manual stratified split: split each decision class 80/20 on its own, then
# stack the per-class pieces so train and test both contain every class.

s1_class0 = norm_data1[norm_data1['s1_decision'] == 0]
s1_class1 = norm_data1[norm_data1['s1_decision'] == 1]

# Columns 0-4 are the features (age + 4 priorities); 's1_decision' is the label.
X1_class0 = s1_class0.iloc[:, 0:5]
y1_class0 = s1_class0['s1_decision']
X1_class0_train, X1_class0_test, y1_class0_train, y1_class0_test = train_test_split(X1_class0, y1_class0, test_size=0.20, random_state=42)

X1_class1 = s1_class1.iloc[:, 0:5]
y1_class1 = s1_class1['s1_decision']
X1_class1_train, X1_class1_test, y1_class1_train, y1_class1_test = train_test_split(X1_class1, y1_class1, test_size=0.20, random_state=42)


# pd.concat replaces DataFrame.append / Series.append, which were removed in pandas 2.0.
X1_train = pd.concat([X1_class0_train, X1_class1_train])
y1_train = pd.concat([y1_class0_train, y1_class1_train])
X1_test = pd.concat([X1_class0_test, X1_class1_test])
y1_test = pd.concat([y1_class0_test, y1_class1_test])

In [12]:
#scenario 2
# Manual stratified split: split each decision class 80/20 on its own, then
# stack the per-class pieces so train and test both contain every class.
s2_class0 = norm_data2[norm_data2['s2_decision'] == 0]
s2_class1 = norm_data2[norm_data2['s2_decision'] == 1]
s2_class2 = norm_data2[norm_data2['s2_decision'] == 2]

# Columns 0-9 are the features (age + 9 priorities); 's2_decision' is the label.
X2_class0 = s2_class0.iloc[:, 0:10]
y2_class0 = s2_class0['s2_decision']
X2_class0_train, X2_class0_test, y2_class0_train, y2_class0_test = train_test_split(X2_class0, y2_class0, test_size=0.20, random_state=42)

X2_class1 = s2_class1.iloc[:, 0:10]
y2_class1 = s2_class1['s2_decision']
X2_class1_train, X2_class1_test, y2_class1_train, y2_class1_test = train_test_split(X2_class1, y2_class1, test_size=0.20, random_state=42)

X2_class2 = s2_class2.iloc[:, 0:10]
y2_class2 = s2_class2['s2_decision']
X2_class2_train, X2_class2_test, y2_class2_train, y2_class2_test = train_test_split(X2_class2, y2_class2, test_size=0.20, random_state=42)


# pd.concat replaces DataFrame.append / Series.append, which were removed in pandas 2.0.
X2_train = pd.concat([X2_class0_train, X2_class1_train, X2_class2_train])
y2_train = pd.concat([y2_class0_train, y2_class1_train, y2_class2_train])
X2_test = pd.concat([X2_class0_test, X2_class1_test, X2_class2_test])
y2_test = pd.concat([y2_class0_test, y2_class1_test, y2_class2_test])

In [13]:
#scenario 3
# Manual stratified split: split each decision class 80/20 on its own, then
# stack the per-class pieces so train and test both contain every class.
s3_class0 = norm_data3[norm_data3['s3_decision'] == 0]
s3_class1 = norm_data3[norm_data3['s3_decision'] == 1]
s3_class2 = norm_data3[norm_data3['s3_decision'] == 2]
s3_class3 = norm_data3[norm_data3['s3_decision'] == 3]

# Columns 0-10 are the features (age + 10 priorities); 's3_decision' is the label.
X3_class0 = s3_class0.iloc[:, 0:11]
y3_class0 = s3_class0['s3_decision']
X3_class0_train, X3_class0_test, y3_class0_train, y3_class0_test = train_test_split(X3_class0, y3_class0, test_size=0.20, random_state=42)

X3_class1 = s3_class1.iloc[:, 0:11]
y3_class1 = s3_class1['s3_decision']
X3_class1_train, X3_class1_test, y3_class1_train, y3_class1_test = train_test_split(X3_class1, y3_class1, test_size=0.20, random_state=42)

X3_class2 = s3_class2.iloc[:, 0:11]
y3_class2 = s3_class2['s3_decision']
X3_class2_train, X3_class2_test, y3_class2_train, y3_class2_test = train_test_split(X3_class2, y3_class2, test_size=0.20, random_state=42)

X3_class3 = s3_class3.iloc[:, 0:11]
y3_class3 = s3_class3['s3_decision']
X3_class3_train, X3_class3_test, y3_class3_train, y3_class3_test = train_test_split(X3_class3, y3_class3, test_size=0.20, random_state=42)

# pd.concat replaces DataFrame.append / Series.append, which were removed in pandas 2.0.
X3_train = pd.concat([X3_class0_train, X3_class1_train, X3_class2_train, X3_class3_train])
y3_train = pd.concat([y3_class0_train, y3_class1_train, y3_class2_train, y3_class3_train])
X3_test = pd.concat([X3_class0_test, X3_class1_test, X3_class2_test, X3_class3_test])
y3_test = pd.concat([y3_class0_test, y3_class1_test, y3_class2_test, y3_class3_test])

# Logit Models : All Features

Assessment functions:

In [14]:
def overall_percent_accuracy(actual_decision, predictions):
    """Return the fraction of predictions that equal the actual decision.

    The two inputs are compared element by element in positional order, so
    they may be any mix of lists, NumPy arrays or pandas Series (index
    labels are ignored).

    Parameters
    ----------
    actual_decision : array-like
        True class labels.
    predictions : array-like
        Predicted class labels, same length as ``actual_decision``.

    Returns
    -------
    float
        Value in [0, 1], or NaN when the inputs are empty.

    Raises
    ------
    ValueError
        If the two inputs have different lengths.
    """
    actual = np.asarray(actual_decision)
    predicted = np.asarray(predictions)
    if len(actual) != len(predicted):
        raise ValueError('actual_decision and predictions must have the same length')
    if len(actual) == 0:
        # Nothing to score; mirror percent_acc_per_class rather than dividing by zero.
        return np.nan
    return np.mean(actual == predicted)

def percent_acc_per_class(actual_decisions, predictions, decision_class):
    """Return the accuracy restricted to samples whose actual label is ``decision_class``.

    Inputs are paired by position, not by pandas index label. Building a
    DataFrame from a Series of actuals and a Series of predictions would
    align them on their index labels and silently mis-pair rows whenever the
    two indexes differ (for example a shuffled training index versus a fresh
    0..n-1 index).

    Parameters
    ----------
    actual_decisions : array-like
        True class labels.
    predictions : array-like
        Predicted class labels, same length and order as ``actual_decisions``.
    decision_class : scalar
        The class label to score.

    Returns
    -------
    float
        Fraction of the class's samples that were predicted correctly, or NaN
        when no sample has ``decision_class`` as its actual label.
    """
    actual = np.asarray(actual_decisions)
    predicted = np.asarray(predictions)
    in_class = actual == decision_class
    if not np.any(in_class):
        return np.nan
    return np.mean(predicted[in_class] == actual[in_class])

# Scenario 1


FEATURES: 
FEATURE 1 - AGE
F2 - COMMITMENT
F3 - DISTANCE
F4 - CAREER 
F5 - PARTNER CAREER

In [15]:
# Baseline scenario-1 classifier with equal, explicitly stated class weights.
logit_model_1 = LogisticRegression(
    multi_class='ovr',
    solver='liblinear',
    class_weight={0: 0.5, 1: 0.5},
    random_state=0,
)
logit_model_1.fit(X1_train, y1_train)

# In-sample (training-set) predictions and class probabilities.
prediction_1 = logit_model_1.predict(X1_train)
probs_1 = logit_model_1.predict_proba(X1_train)

# coef_ holds the per-feature weights (B1..B5, one per column of X1_train);
# the bias term B0 is stored separately in intercept_.
coef_1, intercept_1 = logit_model_1.coef_, logit_model_1.intercept_
print('coef_1 is', coef_1)
print('intercept is', intercept_1)

coef_1 is [[ 0.06749925  0.08090934 -0.0186473   0.04756787 -0.07176826]]
intercept is [0.00036613]


In [16]:
# Training-set accuracy of the baseline scenario-1 model: overall, then per decision class.
s1_overall_acc = overall_percent_accuracy(y1_train, prediction_1)
s1_acc_class0, s1_acc_class1 = (
    percent_acc_per_class(y1_train, prediction_1, decision)
    for decision in (0, 1)
)
print('s1 overall accuracy is', s1_overall_acc)
print('class 0 accuracy is', s1_acc_class0)
print('class 1 accuracy is', s1_acc_class1)

s1 overall accuracy is 0.9242424242424242
class 0 accuracy is 0.0
class 1 accuracy is 1.0


### Dataset balancing using class_weight='balanced'

Penalizes the misclassification of the minority class more than the misclassification of the majority class by weighting the loss of each sample by its class weight.

In [17]:
# Scenario-1 classifier with class weights set by scikit-learn's 'balanced' mode.
logit_model_1_balanced = LogisticRegression(
    multi_class='ovr',
    solver='liblinear',
    class_weight='balanced',
    random_state=0,
)
logit_model_1_balanced.fit(X1_train, y1_train)

# In-sample (training-set) predictions and class probabilities.
prediction_1_balanced = logit_model_1_balanced.predict(X1_train)
probs_1_balanced = logit_model_1_balanced.predict_proba(X1_train)

# coef_ holds the per-feature weights (B1..B5); the bias term B0 is intercept_.
coef_1_balanced = logit_model_1_balanced.coef_
intercept_1_balanced = logit_model_1_balanced.intercept_
print('coef_1 is', coef_1_balanced)
print('intercept is', intercept_1_balanced)

coef_1 is [[ 0.1296166   0.05868102 -0.07524433  0.06669653 -0.16321679]]
intercept is [-0.00159057]


In [18]:
# Training-set accuracy of the class-weight-balanced scenario-1 model.
s1_overall_acc_balanced = overall_percent_accuracy(y1_train, prediction_1_balanced)
s1_acc_class0_balanced, s1_acc_class1_balanced = (
    percent_acc_per_class(y1_train, prediction_1_balanced, decision)
    for decision in (0, 1)
)
print('s1 overall accuracy is', s1_overall_acc_balanced)
print('class 0 accuracy is', s1_acc_class0_balanced)
print('class 1 accuracy is', s1_acc_class1_balanced)

s1 overall accuracy is 0.5909090909090909
class 0 accuracy is 0.8
class 1 accuracy is 0.5737704918032787


### Dataset balancing using RandomOverSampler

Creates new data by randomly resampling from the minority class until it is the same size as the majority class.

In [19]:
# Resample the training set with a random over-sampler.
ros = RandomOverSampler(random_state=42)
# `fit_sample` no longer exists in current imbalanced-learn releases; its
# replacement is `fit_resample`. Fall back to the old name only on installs
# that predate `fit_resample`.
if hasattr(ros, 'fit_resample'):
    X1_train_os, y1_train_os = ros.fit_resample(X1_train, y1_train)
else:
    X1_train_os, y1_train_os = ros.fit_sample(X1_train, y1_train)

logit_model_1_os = LogisticRegression(multi_class = 'ovr', solver = 'liblinear',
                                      random_state=0).fit(X1_train_os, y1_train_os)
# Predictions are made on the resampled training set, not on held-out data.
prediction_1_os = logit_model_1_os.predict(X1_train_os)
probs_1_os = logit_model_1_os.predict_proba(X1_train_os)
# coef_ holds the per-feature weights (B1..B5); the bias term B0 is intercept_.
coef_1_os = logit_model_1_os.coef_
intercept_1_os = logit_model_1_os.intercept_
print('coef_1 is', coef_1_os)
print('intercept is', intercept_1_os)

coef_1 is [[ 0.15241212  0.03272325 -0.07968258  0.06975754 -0.15027916]]
intercept is [-0.00131733]


In [20]:
# Accuracy of the random-oversampling scenario-1 model on its resampled training set.
s1_overall_acc_os = overall_percent_accuracy(y1_train_os, prediction_1_os)
s1_acc_class0_os, s1_acc_class1_os = (
    percent_acc_per_class(y1_train_os, prediction_1_os, decision)
    for decision in (0, 1)
)
print('s1 overall accuracy is', s1_overall_acc_os)
print('class 0 accuracy is', s1_acc_class0_os)
print('class 1 accuracy is', s1_acc_class1_os)

s1 overall accuracy is 0.7213114754098361
class 0 accuracy is 0.8032786885245902
class 1 accuracy is 0.639344262295082


### Dataset balancing using SMOTE (another oversampling method)

SMOTE (Synthetic Minority Oversampling Technique) works by selecting examples that are close in the feature space, drawing a line between the examples in the feature space and drawing a new sample at a point along that line.

In [21]:
# Resample the training set with SMOTE, using 4 nearest neighbours.
ros_smote = SMOTE(random_state=42, k_neighbors = 4)
# `fit_sample` no longer exists in current imbalanced-learn releases; its
# replacement is `fit_resample`. Fall back to the old name only on installs
# that predate `fit_resample`.
if hasattr(ros_smote, 'fit_resample'):
    X1_train_smote, y1_train_smote = ros_smote.fit_resample(X1_train, y1_train)
else:
    X1_train_smote, y1_train_smote = ros_smote.fit_sample(X1_train, y1_train)

logit_model_1_smote = LogisticRegression(multi_class = 'ovr', solver = 'liblinear',
                                      random_state=0).fit(X1_train_smote, y1_train_smote)
# Predictions are made on the resampled training set, not on held-out data.
prediction_1_smote = logit_model_1_smote.predict(X1_train_smote)
probs_1_smote = logit_model_1_smote.predict_proba(X1_train_smote)
# coef_ holds the per-feature weights (B1..B5); the bias term B0 is intercept_.
coef_1_smote = logit_model_1_smote.coef_
intercept_1_smote = logit_model_1_smote.intercept_
print('coef_1 is', coef_1_smote)
print('intercept is', intercept_1_smote)

coef_1 is [[ 0.12375419  0.07862248 -0.06293331  0.06455672 -0.1865827 ]]
intercept is [-0.00110049]


In [22]:
# Accuracy of the SMOTE scenario-1 model on its resampled training set.
s1_overall_acc_smote = overall_percent_accuracy(y1_train_smote, prediction_1_smote)
s1_acc_class0_smote, s1_acc_class1_smote = (
    percent_acc_per_class(y1_train_smote, prediction_1_smote, decision)
    for decision in (0, 1)
)
print('s1 overall accuracy is', s1_overall_acc_smote)
print('class 0 accuracy is', s1_acc_class0_smote)
print('class 1 accuracy is', s1_acc_class1_smote)

s1 overall accuracy is 0.6557377049180327
class 0 accuracy is 0.7377049180327869
class 1 accuracy is 0.5737704918032787


### Dataset balancing using RandomUnderSampler

Uses a subset of the majority class by sampling from the majority class until it is the same size as the minority class.

We probably don't want to use this one, since we'd be training on too few samples, but we ran it anyway for comparison.

In [23]:
# Resample the training set with a random under-sampler.
rus = RandomUnderSampler(random_state=42)
# `fit_sample` no longer exists in current imbalanced-learn releases; its
# replacement is `fit_resample`. Fall back to the old name only on installs
# that predate `fit_resample`.
if hasattr(rus, 'fit_resample'):
    X1_train_us, y1_train_us = rus.fit_resample(X1_train, y1_train)
else:
    X1_train_us, y1_train_us = rus.fit_sample(X1_train, y1_train)

logit_model_1_us = LogisticRegression(multi_class = 'ovr', solver = 'liblinear',
                                      random_state=0).fit(X1_train_us, y1_train_us)
# Predictions are made on the resampled training set, not on held-out data.
prediction_1_us = logit_model_1_us.predict(X1_train_us)
probs_1_us = logit_model_1_us.predict_proba(X1_train_us)
# coef_ holds the per-feature weights (B1..B5); the bias term B0 is intercept_.
coef_1_us = logit_model_1_us.coef_
intercept_1_us = logit_model_1_us.intercept_
print('coef_1 is', coef_1_us)
print('intercept is', intercept_1_us)

coef_1 is [[ 0.23874136  0.08044509 -0.08685134  0.01946843 -0.21214738]]
intercept is [-0.0020063]


In [24]:
# Accuracy of the random-undersampling scenario-1 model on its resampled training set.
s1_overall_acc_us = overall_percent_accuracy(y1_train_us, prediction_1_us)
s1_acc_class0_us, s1_acc_class1_us = (
    percent_acc_per_class(y1_train_us, prediction_1_us, decision)
    for decision in (0, 1)
)
print('s1 overall accuracy is', s1_overall_acc_us)
print('class 0 accuracy is', s1_acc_class0_us)
print('class 1 accuracy is', s1_acc_class1_us)

s1 overall accuracy is 0.8
class 0 accuracy is 0.8
class 1 accuracy is 0.8


# Scenario 2

FEATURES: 

FEATURE 1 - AGE
F2 - COMMITMENT
F3 - DISTANCE
F4 - CAREER 
F5 - PARTNER CAREER
F6 - COMMUNICATION
F7 - YOUR TIME
F8 - PARTNER TIME
F9 - SECURITY
F10 - COMPROMISE


In [25]:
# Baseline scenario-2 classifier: one-vs-rest logistic regression on all ten features.
logit_model_2 = LogisticRegression(
    multi_class='ovr',
    solver='newton-cg',
    random_state=0,
)
logit_model_2.fit(X2_train, y2_train)

# In-sample (training-set) predictions and class probabilities.
prediction_2 = logit_model_2.predict(X2_train)
probs_2 = logit_model_2.predict_proba(X2_train)

# coef_ has one row of feature weights per class; intercept_ has one bias term per class.
coef_2, intercept_2 = logit_model_2.coef_, logit_model_2.intercept_
print('coef_2 is', coef_2)
print('intercept is', intercept_2)

coef_2 is [[ 0.03652514 -0.04405781 -0.01079064 -0.36236267  0.29415102 -0.13567996
  -0.08797653  0.18410706 -0.15191982  0.34609442]
 [ 0.14479592  0.16169266 -0.09465949  0.65324157 -0.36846094  0.00192706
  -0.44965251  0.41335367 -0.03526419 -0.35173456]
 [-0.55204745 -0.1770334   0.29883912 -0.38178328  0.04355849  0.65693067
   0.13698815 -0.98185319  0.34819416  0.09408858]]
intercept is [-1.6129546  -2.02123075  6.58155584]


In [26]:
# Training-set accuracy of the baseline scenario-2 model: overall, then per decision class.
s2_overall_acc = overall_percent_accuracy(y2_train, prediction_2)
s2_acc_class0, s2_acc_class1, s2_acc_class2 = (
    percent_acc_per_class(y2_train, prediction_2, decision)
    for decision in (0, 1, 2)
)
print('s2 overall accuracy is', s2_overall_acc)
print('class 0 accuracy is', s2_acc_class0)
print('class 1 accuracy is', s2_acc_class1)
print('class 2 accuracy is', s2_acc_class2)

s2 overall accuracy is 0.6515151515151515
class 0 accuracy is 0.25
class 1 accuracy is 0.868421052631579
class 2 accuracy is 0.625


### Dataset balancing using class_weight='balanced'

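Penalizes the misclassification of the minority class more than the misclassification of the majority class by weighting the loss of each sample by its class weight.

Note: the models in this subsection are also fit on a reduced feature set (the career, partner-career and your-time priorities are dropped, and the remaining priorities are re-normalized), so their scores are not a like-for-like comparison with the all-feature model above.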

In [27]:
# Take the scenario-2 training rows, keep only a subset of the features
# (plus the decision label), and re-normalize the remaining priorities.
df = normalize(
    norm_data2.loc[
        X2_train.index,
        ['age', 's2_commitment', 's2_distance', 's2_communication',
         's2_partner_time', 's2_security', 's2_compromise', 's2_decision'],
    ]
)
# Split the reduced frame into features and label.
d_X2_train = df.drop(columns=['s2_decision'])
d_y2_train = df['s2_decision']
d_X2_train.head()

Unnamed: 0,age,s2_commitment,s2_distance,s2_communication,s2_partner_time,s2_security,s2_compromise
20,14.0,19.609665,5.885998,17.642503,17.642503,19.609665,19.609665
48,21.0,17.02569,14.887506,12.765278,21.270145,17.02569,17.02569
2,13.0,19.351318,12.911274,13.987213,17.199439,19.351318,17.199439
76,20.0,21.734234,8.703346,21.734234,17.390605,8.703346,21.734234
10,13.0,17.53915,15.794183,15.794183,17.53915,15.794183,17.53915


In [28]:
# Multinomial scenario-2 classifier with 'balanced' class weights, fit on the reduced feature set.
logit_model_2_balanced = LogisticRegression(
    multi_class='multinomial',
    solver='newton-cg',
    class_weight='balanced',
    random_state=0,
)
logit_model_2_balanced.fit(d_X2_train, d_y2_train)

# In-sample (training-set) predictions and class probabilities.
prediction_2_balanced = logit_model_2_balanced.predict(d_X2_train)
probs_2_balanced = logit_model_2_balanced.predict_proba(d_X2_train)

# coef_ has one row of feature weights per class; intercept_ has one bias term per class.
coef_2_balanced = logit_model_2_balanced.coef_
intercept_2_balanced = logit_model_2_balanced.intercept_
print('coef_2 is', coef_2_balanced)
print('intercept is', intercept_2_balanced)

coef_2 is [[ 0.2284229   0.02621836 -0.03109945 -0.32777722  0.36475219 -0.07759628
   0.04544476]
 [ 0.2840156   0.06207761 -0.07570234 -0.1997155   0.44751345 -0.047684
  -0.18643006]
 [-0.51243621 -0.08829399  0.10680255  0.52749333 -0.81226336  0.12528523
   0.14099007]]
intercept is [-3.30255866 -4.57257361  7.87513227]


In [29]:
# Training-set accuracy of the class-weight-balanced scenario-2 model.
s2_overall_acc_balanced = overall_percent_accuracy(d_y2_train, prediction_2_balanced)
s2_acc_class0_balanced, s2_acc_class1_balanced, s2_acc_class2_balanced = (
    percent_acc_per_class(d_y2_train, prediction_2_balanced, decision)
    for decision in (0, 1, 2)
)
print('s2 overall accuracy is', s2_overall_acc_balanced)
print('class 0 accuracy is', s2_acc_class0_balanced)
print('class 1 accuracy is', s2_acc_class1_balanced)
print('class 2 accuracy is', s2_acc_class2_balanced)

s2 overall accuracy is 0.5606060606060606
class 0 accuracy is 0.55
class 1 accuracy is 0.5
class 2 accuracy is 0.875


### Dataset balancing using SMOTE (another oversampling method)

SMOTE works by selecting examples that are close in the feature space, drawing a line between the examples in the feature space and drawing a new sample at a point along that line.

In [30]:
# Resample the reduced-feature scenario-2 training set with SMOTE.
ros_smote2 = SMOTE(random_state=42)
# `fit_sample` no longer exists in current imbalanced-learn releases; its
# replacement is `fit_resample`. Fall back to the old name only on installs
# that predate `fit_resample`.
if hasattr(ros_smote2, 'fit_resample'):
    X2_train_smote, y2_train_smote = ros_smote2.fit_resample(d_X2_train, d_y2_train)
else:
    X2_train_smote, y2_train_smote = ros_smote2.fit_sample(d_X2_train, d_y2_train)

logit_model_2_smote = LogisticRegression(multi_class = 'multinomial', solver = 'newton-cg',
                                      random_state=0).fit(X2_train_smote, y2_train_smote)
# Predictions are made on the resampled training set, not on held-out data.
prediction_2_smote = logit_model_2_smote.predict(X2_train_smote)
probs_2_smote = logit_model_2_smote.predict_proba(X2_train_smote)
# coef_ has one row of feature weights per class; intercept_ has one bias term per class.
coef_2_smote = logit_model_2_smote.coef_
intercept_2_smote = logit_model_2_smote.intercept_
print('coef_2 is', coef_2_smote)
print('intercept is', intercept_2_smote)

coef_2 is [[ 0.31741291  0.02169354 -0.05175676 -0.41987289  0.51047893 -0.05793536
  -0.00272048]
 [ 0.38811063  0.13935143 -0.11222223 -0.30472341  0.56784725 -0.09914028
  -0.19102632]
 [-0.705524   -0.16104942  0.16397395  0.72459017 -1.0783257   0.15707221
   0.19374316]]
intercept is [-4.51833861 -6.18805309 10.7063917 ]


In [31]:
# Accuracy of the SMOTE scenario-2 model on its resampled training set.
s2_overall_acc_smote = overall_percent_accuracy(y2_train_smote, prediction_2_smote)
s2_acc_class0_smote, s2_acc_class1_smote, s2_acc_class2_smote = (
    percent_acc_per_class(y2_train_smote, prediction_2_smote, decision)
    for decision in (0, 1, 2)
)
print('s2 overall accuracy is', s2_overall_acc_smote)
print('class 0 accuracy is', s2_acc_class0_smote)
print('class 1 accuracy is', s2_acc_class1_smote)
print('class 2 accuracy is', s2_acc_class2_smote)

s2 overall accuracy is 0.6491228070175439
class 0 accuracy is 0.6052631578947368
class 1 accuracy is 0.42105263157894735
class 2 accuracy is 0.9210526315789473


# Scenario 3

In [32]:
# Baseline scenario-3 classifier: multinomial logistic regression on all eleven features.
logit_model_3 = LogisticRegression(
    multi_class='multinomial',
    solver='newton-cg',
    random_state=0,
)
logit_model_3.fit(X3_train, y3_train)

# In-sample (training-set) predictions and class probabilities.
prediction_3 = logit_model_3.predict(X3_train)
probs_3 = logit_model_3.predict_proba(X3_train)

# coef_ has one row of feature weights per class; intercept_ has one bias term per class.
coef_3, intercept_3 = logit_model_3.coef_, logit_model_3.intercept_
print('coef_3 is', coef_3)
print('intercept is', intercept_3)

coef_3 is [[ 0.03037894  0.63581802 -0.05697129 -0.06571776 -0.40123813 -0.20115731
  -0.71315925  0.34703376  0.00631019  0.09328293  0.38172993]
 [-0.25744151 -0.32246103 -0.04536932  0.2145025  -0.05520957 -0.13041688
   0.00610347  0.25136681  0.35807549  0.29525451 -0.68698533]
 [ 0.06198748  0.4176855  -0.00975899 -0.28197783 -0.07491349 -0.31048837
   0.16405585 -0.08980682  0.12220693  0.0815944   0.0649113 ]
 [ 0.16507509 -0.73104249  0.1120996   0.13319309  0.53136119  0.64206256
   0.54299994 -0.50859375 -0.48659261 -0.47013183  0.2403441 ]]
intercept is [ 0.69590204  6.01942797  0.08175599 -6.79708601]


In [33]:
# Training-set accuracy of the baseline scenario-3 model: overall, then per decision class.
s3_overall_acc = overall_percent_accuracy(y3_train, prediction_3)
s3_acc_class0, s3_acc_class1, s3_acc_class2, s3_acc_class3 = (
    percent_acc_per_class(y3_train, prediction_3, decision)
    for decision in (0, 1, 2, 3)
)
print('s3 overall accuracy is', s3_overall_acc)
print('class 0 accuracy is', s3_acc_class0)
print('class 1 accuracy is', s3_acc_class1)
print('class 2 accuracy is', s3_acc_class2)
print('class 3 accuracy is', s3_acc_class3)

s3 overall accuracy is 0.5757575757575758
class 0 accuracy is 0.25
class 1 accuracy is 0.7368421052631579
class 2 accuracy is 0.6071428571428571
class 3 accuracy is 1.0


### Dataset balancing using class_weight='balanced'

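Penalizes the misclassification of the minority class more than the misclassification of the majority class by weighting the loss of each sample by its class weight.

Note: the balanced model in this subsection is also fit on a reduced feature set (the distance, career, partner-career and security priorities are dropped, and the remaining priorities are re-normalized), so its scores are not a like-for-like comparison with the all-feature model above.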

In [34]:
# Take the scenario-3 training rows, keep only a subset of the features
# (plus the decision label), and re-normalize the remaining priorities.
df = normalize(
    norm_data3.loc[
        X3_train.index,
        ['age', 's3_communication', 's3_your_time',
         's3_family', 's3_partner_time',
         's3_marriage', 's3_commitment',
         's3_decision'],
    ]
)
# Split the reduced frame into features and label.
d_X3_train = df.drop(columns=['s3_decision'])
d_y3_train = df['s3_decision']
d_X3_train.head()

Unnamed: 0,age,s3_communication,s3_your_time,s3_family,s3_partner_time,s3_marriage,s3_commitment
38,20.0,17.312408,15.383358,15.383358,15.383358,17.312408,19.22511
16,14.0,21.050847,20.0,3.152542,20.0,14.745763,21.050847
50,23.0,19.230769,19.230769,3.846154,19.230769,19.230769,19.230769
5,14.0,21.428571,14.285714,7.142857,21.428571,14.285714,21.428571
71,20.0,16.98204,18.858242,13.213598,16.98204,16.98204,16.98204


In [35]:
# Multinomial scenario-3 classifier with 'balanced' class weights, fit on the reduced feature set.
logit_model_3_balanced = LogisticRegression(
    multi_class='multinomial',
    solver='newton-cg',
    class_weight='balanced',
    random_state=0,
)
logit_model_3_balanced.fit(d_X3_train, d_y3_train)

# In-sample (training-set) predictions and class probabilities.
prediction_3_balanced = logit_model_3_balanced.predict(d_X3_train)
probs_3_balanced = logit_model_3_balanced.predict_proba(d_X3_train)

# coef_ has one row of feature weights per class; intercept_ has one bias term per class.
coef_3_balanced = logit_model_3_balanced.coef_
intercept_3_balanced = logit_model_3_balanced.intercept_
print('coef_3 is', coef_3_balanced)
print('intercept is', intercept_3_balanced)

coef_3 is [[ 0.00653214 -0.34700892 -0.59942077  0.16691671  0.13770882  0.19350172
   0.44873094]
 [-0.21059624 -0.25527562 -0.0263811   0.34145822  0.31813662 -0.39490766
   0.01625554]
 [ 0.00690453 -0.38148766 -0.21828661  0.16376084  0.05111185  0.03244631
   0.35295282]
 [ 0.19715956  0.98377224  0.84408853 -0.67213573 -0.50695725  0.16895968
  -0.81793929]]
intercept is [  3.94981696   6.80089836   3.8199915  -14.57070682]


In [36]:
# Training-set accuracy of the class-weight-balanced scenario-3 model.
# Every metric is scored against d_y3_train, the labels the model was fit on.
# The overall figure previously used y3_train while the per-class figures used
# d_y3_train.
s3_overall_acc_balanced = overall_percent_accuracy(d_y3_train, prediction_3_balanced)
s3_acc_class0_balanced = percent_acc_per_class(d_y3_train, prediction_3_balanced, 0)
s3_acc_class1_balanced = percent_acc_per_class(d_y3_train, prediction_3_balanced, 1)
s3_acc_class2_balanced = percent_acc_per_class(d_y3_train, prediction_3_balanced, 2)
s3_acc_class3_balanced = percent_acc_per_class(d_y3_train, prediction_3_balanced, 3)
print('s3 overall accuracy is', s3_overall_acc_balanced)
print('class 0 accuracy is', s3_acc_class0_balanced)
print('class 1 accuracy is', s3_acc_class1_balanced)
print('class 2 accuracy is', s3_acc_class2_balanced)
print('class 3 accuracy is', s3_acc_class3_balanced)

s3 overall accuracy is 0.6060606060606061
class 0 accuracy is 0.75
class 1 accuracy is 0.7894736842105263
class 2 accuracy is 0.35714285714285715
class 3 accuracy is 1.0


### Dataset balancing using SMOTE (another oversampling method)

SMOTE works by selecting examples that are close in the feature space, drawing a line between the examples in the feature space and drawing a new sample at a point along that line.

In [37]:
# Oversample the scenario 3 training split with SMOTE, then refit on the result.
# NOTE(review): fit_sample is the older imbalanced-learn spelling; newer
# releases expose fit_resample instead -- confirm against the installed version.
ros_smote3 = SMOTE(random_state=42, k_neighbors=2)
X3_train_smote, y3_train_smote = ros_smote3.fit_sample(X3_train, y3_train)

logit_model_3_smote = LogisticRegression(
    multi_class='multinomial',
    solver='newton-cg',
    random_state=0,
)
logit_model_3_smote.fit(X3_train_smote, y3_train_smote)

# Predictions and probabilities are taken on the resampled training rows.
prediction_3_smote = logit_model_3_smote.predict(X3_train_smote)
probs_3_smote = logit_model_3_smote.predict_proba(X3_train_smote)

coef_3_smote, intercept_3_smote = (
    logit_model_3_smote.coef_,
    logit_model_3_smote.intercept_,
)
print('coef_3 is', coef_3_smote)
print('intercept is', intercept_3_smote)

coef_3 is [[-2.42106623e-04  7.50486430e-01 -3.12667989e-02 -2.25691962e-02
  -4.66632442e-01 -2.85351124e-01 -9.49498582e-01  4.74779247e-01
   5.32420743e-02  9.27362166e-02  4.17465863e-01]
 [-2.36381617e-01 -3.90459508e-01 -5.62441015e-02  2.05771398e-01
  -1.02155991e-01 -2.04332210e-01  2.56178650e-02  3.63681030e-01
   4.00320549e-01  3.65865700e-01 -7.50357053e-01]
 [ 5.94961412e-02  6.02057204e-01 -2.21154440e-02 -3.03668268e-01
  -1.45475697e-01 -4.31189057e-01  1.85564457e-01 -1.13111021e-01
   1.70283617e-01  1.04811483e-01  5.48527434e-02]
 [ 1.77127582e-01 -9.62084127e-01  1.09626344e-01  1.20466066e-01
   7.14264129e-01  9.20872390e-01  7.38316260e-01 -7.25349256e-01
  -6.23846241e-01 -5.63413400e-01  2.78038447e-01]]
intercept is [ 1.53862227  6.26751869 -0.31507804 -7.49106292]


In [38]:
s3_overall_acc_smote = overall_percent_accuracy(y3_train_smote, prediction_3_smote)
s3_acc_class0_smote = percent_acc_per_class(y3_train_smote, prediction_3_smote, 0)
s3_acc_class1_smote = percent_acc_per_class(y3_train_smote, prediction_3_smote, 1)
s3_acc_class2_smote = percent_acc_per_class(y3_train_smote, prediction_3_smote, 2)
s3_acc_class3_smote = percent_acc_per_class(y3_train_smote, prediction_3_smote, 3)
print('s3 overall accuracy is', s3_overall_acc_smote)
print('class 0 accuracy is', s3_acc_class0_smote)
print('class 1 accuracy is', s3_acc_class1_smote)
print('class 2 accuracy is', s3_acc_class2_smote)
print('class 3 accuracy is', s3_acc_class3_smote)

s3 overall accuracy is 0.7053571428571429
class 0 accuracy is 0.6071428571428571
class 1 accuracy is 0.75
class 2 accuracy is 0.4642857142857143
class 3 accuracy is 1.0


# Logit Models: Initial Feature Selection (w/o DB)

# Scenario 1

FEATURES: COMMITMENT, PARTNER CAREER, DISTANCE (pretty strong, also pretty strong by just excluding age) 
Observation: when age is excluded, the distance coefficient is positive.

In [39]:
# Scenario 1 feature selection: keep age, the three chosen priorities and the
# decision label for the training rows only, then re-run normalize on that subset.
f1_train_rows = np.array(X1_train.index)
f1_columns = ['age', 's1_commitment', 's1_partner_career', 's1_distance', 's1_decision']
df = normalize(data_1.loc[f1_train_rows, :][f1_columns])

# Split the normalized frame into target and predictors.
f_y1_train = df['s1_decision']
f_X1_train = df.drop('s1_decision', axis=1)
f_X1_train.head()

Unnamed: 0,age,s1_commitment,s1_partner_career,s1_distance
48,21.0,29.166667,33.333333,37.5
20,14.0,43.478261,43.478261,13.043478
31,13.0,36.111111,47.222222,16.666667
23,13.0,44.444444,27.777778,27.777778
55,22.0,33.333333,33.333333,33.333333


In [40]:
# Scenario 1 logistic regression on the selected features
# (one-vs-rest with the liblinear solver).
f_model_1 = LogisticRegression(multi_class='ovr', solver='liblinear', random_state=0)
f_model_1.fit(f_X1_train, f_y1_train)

# Predictions and probabilities on the training rows.
f_prediction_1 = f_model_1.predict(f_X1_train)
f_probs_1 = f_model_1.predict_proba(f_X1_train)

# Fitted slope coefficients (B1) and intercept.
f_coef_1, f_intercept_1 = f_model_1.coef_, f_model_1.intercept_
print('coef_1 is', f_coef_1)
print('intercept is', f_intercept_1)

coef_1 is [[ 0.07156502  0.07317657 -0.03556449 -0.01018125]]
intercept is [0.00027431]


In [41]:
# Training-set accuracies for scenario 1 with the selected features:
# one overall score plus one score per decision class (0 and 1).
s1_overall_acc_feat = overall_percent_accuracy(f_y1_train, f_prediction_1)
s1_acc_class0_feat, s1_acc_class1_feat = (
    percent_acc_per_class(f_y1_train, f_prediction_1, label) for label in (0, 1)
)

print('s1 overall accuracy is', s1_overall_acc_feat)
print('class 0 accuracy is', s1_acc_class0_feat)
print('class 1 accuracy is', s1_acc_class1_feat)

s1 overall accuracy is 0.9242424242424242
class 0 accuracy is 0.0
class 1 accuracy is 1.0


# Scenario 2

In [42]:
# Scenario 2 feature selection: take the training rows, discard the four
# priorities that are not used, then re-run normalize on what is left.
f2_dropped_columns = ['s2_partner_career', 's2_career', 's2_your_time', 's2_security']
f2_train_rows = np.array(X2_train.index)
df = normalize(norm_data2.loc[f2_train_rows, :].drop(f2_dropped_columns, axis=1))

# Split the normalized frame into target and predictors.
f_y2_train = df['s2_decision']
f_X2_train = df.drop('s2_decision', axis=1)
f_X2_train.head()

Unnamed: 0,age,s2_commitment,s2_distance,s2_communication,s2_partner_time,s2_compromise
20,14.0,24.393064,7.321773,21.94605,21.94605,24.393064
48,21.0,20.519231,17.942308,15.384615,25.634615,20.519231
2,13.0,23.994586,16.009281,17.343387,21.326373,21.326373
76,20.0,23.806167,9.53304,23.806167,19.048458,23.806167
10,13.0,20.828905,18.756642,18.756642,20.828905,20.828905


In [43]:
# Scenario 2 multinomial logistic regression on the selected features.
f_model_2 = LogisticRegression(
    multi_class='multinomial',
    solver='newton-cg',
    random_state=0,
)
f_model_2.fit(f_X2_train, f_y2_train)

# Predictions and probabilities on the training rows.
f_prediction_2 = f_model_2.predict(f_X2_train)
f_probs_2 = f_model_2.predict_proba(f_X2_train)

f_coef_2, f_intercept_2 = f_model_2.coef_, f_model_2.intercept_
print('coef_2 is', f_coef_2)
print('intercept is', f_intercept_2)

coef_2 is [[ 0.20506809  0.03157688 -0.0274028  -0.23725712  0.23901861 -0.00601848]
 [ 0.2736407   0.06732109 -0.05880414 -0.14900334  0.30734091 -0.16688189]
 [-0.47872442 -0.09893592  0.0862095   0.3862428  -0.54641779  0.17284978]]
intercept is [-2.9542027 -3.7772444  6.7314471]


In [44]:
# Training-set accuracies for scenario 2 with the selected features:
# one overall score plus one score per decision class (0, 1, 2).
s2_overall_acc_feat = overall_percent_accuracy(f_y2_train, f_prediction_2)
s2_acc_class0_feat = percent_acc_per_class(f_y2_train, f_prediction_2, 0)
s2_acc_class1_feat = percent_acc_per_class(f_y2_train, f_prediction_2, 1)
s2_acc_class2_feat = percent_acc_per_class(f_y2_train, f_prediction_2, 2)

# The label names scenario 2: it previously read 's1', a copy-paste leftover
# that mislabelled this score as belonging to scenario 1.
print('s2 overall accuracy is', s2_overall_acc_feat)
print('class 0 accuracy is', s2_acc_class0_feat)
print('class 1 accuracy is', s2_acc_class1_feat)
print('class 2 accuracy is', s2_acc_class2_feat)

s1 overall accuracy is 0.6666666666666666
class 0 accuracy is 0.25
class 1 accuracy is 0.9210526315789473
class 2 accuracy is 0.5


# Scenario 3

In [45]:
# Scenario 3 feature selection: keep age, five chosen priorities and the
# decision label for the training rows, then re-run normalize on that subset.
f3_train_rows = np.array(X3_train.index)
f3_columns = [
    'age', 's3_communication', 's3_your_time',
    's3_partner_time', 's3_family', 's3_marriage',
    's3_decision',
]
df = normalize(norm_data3.loc[f3_train_rows, :][f3_columns])

# Split the normalized frame into target and predictors.
f_y3_train = df['s3_decision']
f_X3_train = df.drop('s3_decision', axis=1)
f_X3_train.head()

Unnamed: 0,age,s3_communication,s3_your_time,s3_partner_time,s3_family,s3_marriage
38,20.0,21.432908,19.044728,19.044728,19.044728,21.432908
16,14.0,26.663804,25.332761,25.332761,3.99313,18.677544
50,23.0,23.809524,23.809524,23.809524,4.761905,23.809524
5,14.0,27.272727,18.181818,27.272727,9.090909,18.181818
71,20.0,20.455862,22.715859,20.455862,15.916554,20.455862


In [46]:
# Scenario 3 multinomial logistic regression on the selected features.
f_model_3 = LogisticRegression(multi_class = 'multinomial', solver = 'newton-cg', random_state=0).fit(f_X3_train, f_y3_train)

# Predictions and probabilities on the training rows.
f_prediction_3 = f_model_3.predict(f_X3_train)
f_probs_3 = f_model_3.predict_proba(f_X3_train)

f_coef_3 = f_model_3.coef_
f_intercept_3 = f_model_3.intercept_

# The label names scenario 3: it previously read 'coef_2', a copy-paste
# leftover from the scenario 2 cell.
print('coef_3 is', f_coef_3)
print('intercept is', f_intercept_3)

coef_2 is [[ 0.08955699 -0.06800256 -0.54913607  0.39786381  0.08747538  0.13196308]
 [-0.16479006 -0.12334947 -0.03712273  0.25000315  0.23909856 -0.32886322]
 [ 0.09775442 -0.11059489 -0.19542052  0.23907488  0.0774758  -0.01037771]
 [-0.02252136  0.3019469   0.7816793  -0.88694187 -0.40404976  0.20727783]]
intercept is [ 1.56331574  5.0803856   1.63166316 -8.2753645 ]


In [47]:
# SMOTE on the scenario 3 *selected-feature* training data, followed by a refit.
# This cell rebinds ros_smote3, X3_train_smote, y3_train_smote,
# logit_model_3_smote and the derived prediction/coef names that the earlier
# all-features SMOTE cell assigned; from here on they refer to this model.
# NOTE(review): fit_sample is the older imbalanced-learn spelling; newer
# releases expose fit_resample instead -- confirm against the installed version.
ros_smote3 = SMOTE(random_state=42, k_neighbors=2)
X3_train_smote, y3_train_smote = ros_smote3.fit_sample(f_X3_train, f_y3_train)

logit_model_3_smote = LogisticRegression(
    multi_class='multinomial',
    solver='newton-cg',
    random_state=0,
)
logit_model_3_smote.fit(X3_train_smote, y3_train_smote)

# Predictions and probabilities are taken on the resampled training rows.
prediction_3_smote = logit_model_3_smote.predict(X3_train_smote)
probs_3_smote = logit_model_3_smote.predict_proba(X3_train_smote)

coef_3_smote, intercept_3_smote = (
    logit_model_3_smote.coef_,
    logit_model_3_smote.intercept_,
)
print('coef_3 is', coef_3_smote)
print('intercept is', intercept_3_smote)

coef_3 is [[ 0.07331574 -0.05417318 -0.73439227  0.56606439  0.11894598  0.10366991]
 [-0.09821903 -0.19402466 -0.07864917  0.33014689  0.28845168 -0.34623294]
 [ 0.10588888 -0.14678495 -0.26258493  0.34345273  0.10817812 -0.04196147]
 [-0.08097794  0.3949861   1.07563227 -1.23965835 -0.51556552  0.28452423]]
intercept is [ 2.60351571  4.89605319  1.75453463 -9.25410353]


In [48]:
# Training-set accuracies for scenario 3 with the selected features.
# These score f_model_3 (fitted without resampling) via f_prediction_3:
# one overall score plus one score per decision class (0-3).
s3_overall_acc_feat = overall_percent_accuracy(f_y3_train, f_prediction_3)
s3_acc_class0_feat = percent_acc_per_class(f_y3_train, f_prediction_3, 0)
s3_acc_class1_feat = percent_acc_per_class(f_y3_train, f_prediction_3, 1)
s3_acc_class2_feat = percent_acc_per_class(f_y3_train, f_prediction_3, 2)
s3_acc_class3_feat = percent_acc_per_class(f_y3_train, f_prediction_3, 3)

# The label names scenario 3: it previously read 's1', a copy-paste leftover
# that mislabelled this score as belonging to scenario 1.
print('s3 overall accuracy is', s3_overall_acc_feat)
print('class 0 accuracy is', s3_acc_class0_feat)
print('class 1 accuracy is', s3_acc_class1_feat)
print('class 2 accuracy is', s3_acc_class2_feat)
print('class 3 accuracy is', s3_acc_class3_feat)

s1 overall accuracy is 0.6060606060606061
class 0 accuracy is 0.1875
class 1 accuracy is 0.7368421052631579
class 2 accuracy is 0.75
class 3 accuracy is 0.6666666666666666


# Predictions for Test Set

Scenario 1: SMOTE MODEL W/ ALL FEATURES

In [49]:
# Scenario 1 held-out evaluation: predict the test rows with logit_model_1_os
# and report the overall score plus one score per decision class (0 and 1).
pred_y1_os = logit_model_1_os.predict(X1_test)

s1_overall_acc_test_os = overall_percent_accuracy(y1_test, pred_y1_os)
s1_acc_class0_test_os, s1_acc_class1_test_os = (
    percent_acc_per_class(y1_test, pred_y1_os, label) for label in (0, 1)
)

print('s1 overall accuracy is', s1_overall_acc_test_os)
print('class 0 accuracy is', s1_acc_class0_test_os)
print('class 1 accuracy is', s1_acc_class1_test_os)

s1 overall accuracy is 0.6666666666666666
class 0 accuracy is 1.0
class 1 accuracy is 0.625


In [50]:
# Show the fitted coefficients and intercept of the scenario 1 model used on the test set.
(logit_model_1_os.coef_, logit_model_1_os.intercept_)

(array([[ 0.15241212,  0.03272325, -0.07968258,  0.06975754, -0.15027916]]),
 array([-0.00131733]))

Scenario 2: SMOTE MODEL W/ SELECTED FEATURES


In [54]:
# Rebuild the scenario 2 test predictors: take the test rows, keep the listed
# columns, re-run normalize on that subset, and drop the label.
# X2_test is rebound here; only its index is read before the reassignment.
f2_test_rows = np.array(X2_test.index)
f2_test_columns = [
    'age', 's2_commitment', 's2_distance', 's2_communication',
    's2_partner_time', 's2_security', 's2_compromise', 's2_decision',
]
df = normalize(norm_data2.loc[f2_test_rows, :][f2_test_columns])
X2_test = df.drop('s2_decision', axis=1)

In [55]:
# Scenario 2 held-out evaluation: predict the test rows with logit_model_2_smote
# and report the overall score plus one score per decision class (0, 1, 2).
pred_y2_os = logit_model_2_smote.predict(X2_test)

s2_overall_acc_test_os = overall_percent_accuracy(y2_test, pred_y2_os)
s2_acc_class0_test_os, s2_acc_class1_test_os, s2_acc_class2_test_os = (
    percent_acc_per_class(y2_test, pred_y2_os, label) for label in range(3)
)

print('s2 overall accuracy is', s2_overall_acc_test_os)
print('class 0 accuracy is', s2_acc_class0_test_os)
print('class 1 accuracy is', s2_acc_class1_test_os)
print('class 2 accuracy is', s2_acc_class2_test_os)

s2 overall accuracy is 0.5555555555555556
class 0 accuracy is 0.4
class 1 accuracy is 0.6
class 2 accuracy is 0.6666666666666666


In [56]:
# Show the fitted coefficients and intercept of the scenario 2 SMOTE model used on the test set.
(logit_model_2_smote.coef_, logit_model_2_smote.intercept_)

(array([[ 0.31741291,  0.02169354, -0.05175676, -0.41987289,  0.51047893,
         -0.05793536, -0.00272048],
        [ 0.38811063,  0.13935143, -0.11222223, -0.30472341,  0.56784725,
         -0.09914028, -0.19102632],
        [-0.705524  , -0.16104942,  0.16397395,  0.72459017, -1.0783257 ,
          0.15707221,  0.19374316]]),
 array([-4.51833861, -6.18805309, 10.7063917 ]))

Scenario 3: SMOTE MODEL W/ SELECTED FEATURES

In [62]:
# Scenario 3 held-out evaluation with the SMOTE model. Only the six columns
# listed here are passed to the model.
# NOTE(review): the training subset was passed through normalize() again after
# column selection, whereas X3_test is only column-sliced here -- confirm the
# test columns are on the same scale as the training features.
pred_y3_os = logit_model_3_smote.predict(X3_test[['age', 's3_communication', 's3_your_time', 's3_partner_time', 's3_family', 's3_marriage']])

# One overall score plus one score per decision class (0-3).
s3_overall_acc_test = overall_percent_accuracy(y3_test, pred_y3_os)
s3_acc_class0_test = percent_acc_per_class(y3_test, pred_y3_os, 0)
s3_acc_class1_test = percent_acc_per_class(y3_test, pred_y3_os, 1)
s3_acc_class2_test = percent_acc_per_class(y3_test, pred_y3_os, 2)
s3_acc_class3_test = percent_acc_per_class(y3_test, pred_y3_os, 3)

# The label names scenario 3: it previously read 's1', a copy-paste leftover
# that mislabelled this score as belonging to scenario 1.
print('s3 overall accuracy is', s3_overall_acc_test)
print('class 0 accuracy is', s3_acc_class0_test)
print('class 1 accuracy is', s3_acc_class1_test)
print('class 2 accuracy is', s3_acc_class2_test)
print('class 3 accuracy is', s3_acc_class3_test)


s1 overall accuracy is 0.4444444444444444
class 0 accuracy is 0.75
class 1 accuracy is 0.6
class 2 accuracy is 0.25
class 3 accuracy is 0.0


In [74]:
# Show the fitted coefficients and intercept of the scenario 3 SMOTE model used on the test set.
(logit_model_3_smote.coef_, logit_model_3_smote.intercept_)

(array([[ 0.07331574, -0.05417318, -0.73439227,  0.56606439,  0.11894598,
          0.10366991],
        [-0.09821903, -0.19402466, -0.07864917,  0.33014689,  0.28845168,
         -0.34623294],
        [ 0.10588888, -0.14678495, -0.26258493,  0.34345273,  0.10817812,
         -0.04196147],
        [-0.08097794,  0.3949861 ,  1.07563227, -1.23965835, -0.51556552,
          0.28452423]]),
 array([ 2.60351571,  4.89605319,  1.75453463, -9.25410353]))