In [None]:
import pandas as pd
import numpy as np
import statsmodels.api as sm
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import LogisticRegression
import warnings
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn.utils import resample
from imblearn.over_sampling import SMOTE
from sklearn import metrics
import statsmodels.api as sm
from sklearn.model_selection import cross_val_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix
from sklearn.ensemble import RandomForestRegressor


In [None]:
# Read the three lab CSVs (categorical features, numerical features, targets)
# in one pass; the generator is unpacked in the same order as the paths.
categorical, numerical, targets = (
    pd.read_csv(csv_path)
    for csv_path in (
        'files_for_lab/categorical7_02.csv',
        'files_for_lab/numerical7_02.csv',
        'files_for_lab/target7_02.csv',
    )
)

In [None]:
def drop(x):
    """Return a copy of ``x`` without the ``'Unnamed: 0'`` column.

    Parameters
    ----------
    x : pandas.DataFrame
        Frame that may contain an ``'Unnamed: 0'`` column.

    Returns
    -------
    pandas.DataFrame
        A new frame; ``x`` itself is not modified.

    The column is skipped silently when it is absent, so the cell that
    reassigns ``frame = drop(frame)`` can be re-run without a KeyError.
    """
    return x.drop(columns='Unnamed: 0', errors='ignore')
    
# Strip the 'Unnamed: 0' column from each of the three loaded frames.
categorical, numerical, targets = (
    drop(frame) for frame in (categorical, numerical, targets)
)

In [None]:
# Peek at the first two rows of every frame, plus the size of the categorical one.
display(
    numerical.head(2),
    targets.head(2),
    categorical.head(2),
    categorical.shape,
)

In [None]:
# Fill missing VETERANS entries with 'N', then discard five columns that are
# not used further down.
categorical = (
    categorical
    .assign(VETERANS=categorical['VETERANS'].fillna('N'))
    .drop(columns=['OSOURCE', 'NOEXCH', 'SOLIH', 'MDMAUD', 'ZIP'])
)
# NOTE(review): dropna() returns a shorter Series, but assigning it back to the
# column re-aligns it on the frame's index, so any missing entries come back as
# NaN -- this line appears to have no effect. Kept as-is because changing how
# DATASRCE is filled would change the one-hot column numbering that the
# hard-coded feature list further down relies on; confirm the intent.
categorical['DATASRCE'] = categorical['DATASRCE'].dropna()


In [None]:
#Categorical

# One-hot encode every categorical column (first level of each dropped) and
# densify the result. The DataFrame is built without column names, so its
# columns are labelled with integers 0..n-1.
encoder = OneHotEncoder(drop='first')
encoded_categorical = pd.DataFrame(encoder.fit_transform(categorical).toarray())
encoded_categorical

In [None]:
transformer = StandardScaler().fit(numerical)
numerical_scaled = transformer.transform(numerical)

In [None]:
numerical_scaled = pd.DataFrame(numerical_scaled, columns = numerical.columns)

In [None]:
# Join scaled numericals, both targets and the one-hot block column-wise;
# rows are matched on the index of each frame.
all_data = pd.concat(
    [numerical_scaled, targets, encoded_categorical],
    axis='columns',
)
all_data.info()

In [None]:
# Missing-value check: show only the columns that actually contain NaNs,
# labelled by column name (an empty result means the table is complete).
# Printing one bare count per column gave a long, unlabelled wall of numbers.
missing_per_column = all_data.isna().sum()
missing_per_column[missing_per_column > 0]


In [None]:
newdat = all_data[all_data['TARGET_B']==1]
X = newdat.drop(columns=['TARGET_B','TARGET_D'])
y = newdat['TARGET_D']

In [None]:
y

In [None]:
y.shape

In [None]:
y.value_counts()

In [None]:
# Ordinary least squares of TARGET_D on all features plus an intercept column;
# the fitted result's p-values are used for feature selection below.
X_added_constant = sm.add_constant(X)
ols_spec = sm.OLS(endog=y, exog=X_added_constant)
model = ols_spec.fit()
model.summary()

In [None]:
# Put the OLS p-values into a one-column frame (column label 0, indexed by
# term name) and keep only the terms significant at the 5% level.
p = list(model.pvalues)
pp = pd.DataFrame(p, index=model.pvalues.index)
is_significant = pp[0] < 0.05
pvalues = pp[is_significant]
pvalues.columns

In [None]:
# Sort the significant terms by p-value (smallest first) so the strongest
# predictors sit at the top; the leading rows are picked in the following cell.
# rename() builds a new frame instead of adding a column to the filtered slice,
# which avoids pandas' SettingWithCopyWarning and keeps the cell re-runnable
# (a label that is already renamed is simply left alone).
pvalues = pvalues.rename(columns={0: 'vals'}).sort_values(by='vals')
pvalues

In [None]:
# Take the 30 smallest p-values, then remove the intercept term if it made the
# cut (so at most 30 and typically 29 real features remain).
# - .iloc makes the slice explicitly positional.
# - drop() returns a new frame rather than mutating a slice in place.
# - errors='ignore' keeps the cell working when 'const' is not among the 30 rows.
top30 = pvalues.iloc[:30].drop(index='const', errors='ignore')

In [None]:
##Assign the columns to X
# The selected term names become the feature columns; the response switches
# from TARGET_D to TARGET_B, and all rows of all_data are used again.
xcols = top30.index.tolist()

X = all_data.loc[:, xcols]
y = all_data.loc[:, 'TARGET_B']

In [None]:
#Heatmap doesn't show us much
# Pairwise correlations between the selected features, drawn as a heatmap.
feature_corr = X.corr()
sns.heatmap(feature_corr)


In [None]:
# Hand-picked final feature set. String labels are numerical columns; integer
# labels are positions in the one-hot encoded block, so this list is only valid
# for the exact encoding produced above.
selected_features = [
    'LASTGIFT', 73, 'MINRAMNT', 'RAMNTALL', 'RFA_2F', 72,
    'NGIFTALL', 'NEXTDATE', 78, 81, 'TIMELAG', 'ETH16',
    31, 46, 'ETHC1', 'HUR1', 50, 'ETH1',
    'ETH13', 'ETH15',
]
# The selection mixes int and str column labels. Recent scikit-learn releases
# refuse DataFrames whose feature names are not all strings, so convert every
# label to str here (73 -> '73') before the frame reaches any estimator.
X = X[selected_features].rename(columns=str)

In [None]:
X.shape, y.shape

In [None]:
all_data['TARGET_B'].value_counts()

In [None]:
#train-test-split
# Default test fraction, fixed seed so the partition is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=0,
)

In [None]:
#Scaling
# The min-max scaler is fitted on the training rows only and then applied to
# both partitions.
transformer = MinMaxScaler()
transformer.fit(X_train)

#Normalising X_train/X_test and wrapping the arrays back into dfs
# (the new frames get a fresh 0..n-1 index)
X_train_normalised = pd.DataFrame(
    transformer.transform(X_train), columns=X_train.columns
)
X_test_normalised = pd.DataFrame(
    transformer.transform(X_test), columns=X_test.columns
)
display(X_train_normalised.head(), X_train_normalised.shape)

In [None]:
X_train_normalised.info()

In [None]:
y_train = y_train.reset_index(drop=True) 
y_test = y_test.reset_index(drop=True) 


In [None]:
classification_untreat = LogisticRegression(random_state=0, solver='saga', multi_class='multinomial').fit(X_train_normalised, y_train)

In [None]:
classification_untreat.score(X_test_normalised, y_test)

In [None]:
pred_untreat = classification_untreat.predict(X_test_normalised)

In [None]:
print(metrics.classification_report(y_test, pred_untreat))

In [None]:
dcombine = pd.concat((y_train, X_train_normalised), axis =1)
dcombine.head()

In [None]:
yes = dcombine[dcombine['TARGET_B'] == 1]
no = dcombine[dcombine['TARGET_B'] == 0] 

In [None]:
print('Yes',yes.shape)
print('No',no.shape)

In [None]:
# Undersample the TARGET_B == 0 rows without replacement down to the number of
# TARGET_B == 1 rows. The seed is fixed so that the same rows are drawn on
# every run; without it every downstream score changes between executions.
nods = resample(no, replace=False, n_samples=len(yes), random_state=0)
print('Yes',yes.shape)
print('No',nods.shape)

In [None]:
combined_under = pd.concat([yes,nods], axis = 0)
combined_under.isna().sum()

In [None]:
#SPLIT X AND y for training

X_train_under = combined_under.drop(columns=['TARGET_B']) 
y_train_under = combined_under['TARGET_B']

In [None]:
classification_under = LogisticRegression(random_state=0, solver='saga', 
                                    multi_class='multinomial').fit(X_train_under, y_train_under)

In [None]:
classification_under.score(X_test_normalised, y_test)

In [None]:
pred_under = classification_under.predict(X_test_normalised)
print(metrics.classification_report(y_test, pred_under)) 

In [None]:
# Compare two classifiers on the undersampled training data with 5-fold
# cross-validation (mean of the default score per model).
model1 = DecisionTreeClassifier(random_state=0)  # seeded so the tree is repeatable
model2 = LogisticRegression()

model_pipeline = [model1, model2]
model_names = ['Decision Tree Classifier', 'Logistic Regression']
# A comprehension keeps its loop variables local, so the fitted OLS result
# stored in `model` earlier in the notebook is no longer overwritten here.
scores = {
    name: np.mean(cross_val_score(estimator, X_train_under, y_train_under, cv=5))
    for estimator, name in zip(model_pipeline, model_names)
}
# Show the comparison; previously the dict was computed but never displayed.
scores

In [None]:
# Shallow, heavily regularised random forest trained on the unscaled features
# with the original class proportions.
forest_params = dict(
    max_depth=5,
    min_samples_split=20,
    min_samples_leaf=20,
    max_samples=0.2,
    random_state=42,
)
clf = RandomForestClassifier(**forest_params)
clf.fit(X_train, y_train)
# Accuracy on the training rows first, then on the test rows.
for features, labels in ((X_train, y_train), (X_test, y_test)):
    print(clf.score(features, labels))

# Compare the true class counts in the test set with the confusion matrix.
y_pred = clf.predict(X_test)
display(y_test.value_counts(), confusion_matrix(y_test, y_pred))

In [None]:
# 10-fold cross-validated accuracy of a shallow random forest on the training
# rows. cross_val_score is already imported in the notebook's import cell, so
# the duplicate mid-notebook import is gone. The forest is seeded (same seed as
# the forest fitted above) so the reported mean is repeatable.
clf = RandomForestClassifier(max_depth=5,
                             min_samples_split=20,
                             min_samples_leaf=20,
                             random_state=42)
cross_val_scores = cross_val_score(clf, X_train, y_train, cv=10)
print(np.mean(cross_val_scores))

In [None]:
X_test


In [None]:
X_train = X_train.reset_index(drop = True)
X_test = X_test.reset_index(drop = True)
train = pd.concat([X_train, y_train], axis=1).reset_index(drop=True)
test = pd.concat([X_test, y_test], axis=1)

In [None]:
train

In [None]:
train_donation = train[train['TARGET_B']==1].reset_index(drop = True)
test_donation = test[test['TARGET_B']==1].reset_index(drop = True)

In [None]:
X_train_donation = train_donation.drop(['TARGET_B'], axis=1)
X_test_donation = test_donation.drop(['TARGET_B'], axis=1)

# The regression targets must be the TARGET_D values of exactly the rows in
# X_train_donation / X_test_donation. Taking newdat['TARGET_D'] for both gave
# every TARGET_B == 1 row of the full table twice: wrong length, wrong rows.
# TARGET_D was not carried through the split, so both target columns are
# partitioned here with the same settings as the feature split
# (train_test_split, random_state=0, default test size), which reproduces the
# same row partition in the same order.
targets_train, targets_test = train_test_split(
    all_data[['TARGET_B', 'TARGET_D']], random_state=0
)
# Guard: the re-created partition must match the labels already in train/test.
assert np.array_equal(targets_train['TARGET_B'].to_numpy(), train['TARGET_B'].to_numpy())
assert np.array_equal(targets_test['TARGET_B'].to_numpy(), test['TARGET_B'].to_numpy())

y_train_donation = (
    targets_train.loc[targets_train['TARGET_B'] == 1, 'TARGET_D'].reset_index(drop=True)
)
y_test_donation = (
    targets_test.loc[targets_test['TARGET_B'] == 1, 'TARGET_D'].reset_index(drop=True)
)
assert len(y_train_donation) == len(X_train_donation)
assert len(y_test_donation) == len(X_test_donation)


In [None]:

# Random forest regression of the TARGET_D amount on the TARGET_B == 1 rows.
reg = RandomForestRegressor(max_depth=4, #Max number of questions to ask
                             min_samples_split=2, #Amount of rows still considered at every Q
                             min_samples_leaf =10, #Ultimate answer based on at least this many rows
                             max_samples=0.8, #Fraction of X-train to use in each tree (hyperparameter to tweak)
                             random_state = 42)
reg.fit(X_train_donation, y_train_donation)
# Score the regressor on the data it models (donation rows and TARGET_D
# amounts), not on the classification split whose labels are TARGET_B.
print('train score: ', reg.score(X_train_donation, y_train_donation))
print('test score: ', reg.score(X_test_donation, y_test_donation))
y_pred = reg.predict(X_test_donation)
print('mse: ',metrics.mean_squared_error(y_test_donation, y_pred))