## Team Member Names: Madeline Witters {-}

## Project Title: Predicting Customer Churn and Identifying Attributes of At-Risk Customers {-}

**Exploratory Data Analysis**

In [None]:
import pandas as pd
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LassoCV
from sklearn.preprocessing import StandardScaler

In [None]:
# Read the bank churn dataset and discard the `customer_id` column
# before any analysis is done.
data = pd.read_csv("Bank Customer Churn Prediction.csv").drop(columns="customer_id")
data.head()

In [None]:
# Show the dtype of each column in `data`.
data.dtypes

In [None]:
#EDA to do list: 
# drop customer_id column DONE
# check for missing data DONE
# create boxplots for categorical variables DONE
# create density plots/histograms for quantitative vars DONE
#One-hot encoding: gender, country; zero-balance indicator in place of balance DONE

#info on why it's best to not do too much one-hot encoding for trees: 
#https://towardsdatascience.com/one-hot-encoding-is-making-your-tree-based-ensembles-worse-heres-why-d64b282b5769

#11/17 TO DO LIST: 

#outlier removal (do before standardization): 
#https://medium.com/geekculture/essential-guide-to-handle-outliers-for-your-logistic-regression-model-63c97690a84d

#check for multicollinearity: corr plot? REVISE: create ONLY for numeric vars; remove categorical vars (remove gender, country, active, cc)

#create mosaic plots for categorical vars

#create additional boxplots for numerical vars ?

#split into train/test DONE

#standardize data DONE

#Lasso variable selection DONE (conduct research on what vars to include: perhaps simply include all for now ?)

#Create Logistic regression model

#Create RF model
#   -identify most important vars in RF

In [None]:
# Per-column flag: True if the column contains at least one missing value.
# (Author's note from the original run: no missing data in the dataframe.)
null_check = data.isna().any()
null_check

In [None]:
#data.min()

In [None]:
#data.max()

In [None]:
# Draw one boxplot per numeric column, split by churn status, on a 3x2 grid.
# TODO: add an overall figure title (e.g. "Boxplots of Numeric Variables").
plt.figure(figsize=(20, 20))

boxplot_columns = [
    'credit_score',
    'age',
    'tenure',
    'balance',
    'products_number',
    'estimated_salary',
]
for position, column in enumerate(boxplot_columns, start=1):
    # Subplot positions are 1-based, filled row by row.
    plt.subplot(3, 2, position)
    sns.boxplot(x='churn', y=column, data=data)

In [None]:
# Histogram of `balance` with a kernel density estimate overlaid.
sns.histplot(data=data, x="balance", kde=True)

In [None]:
# Count the rows in each `churn` class to see how balanced the target is.
data['churn'].value_counts()

In [None]:
# One-hot encode `gender`: one indicator column per distinct value.
g_one_hot = pd.get_dummies(data['gender'])
g_one_hot.head()

In [None]:
# Replace the raw `gender` column with its one-hot indicator columns
# (joined on the row index).
data = data.drop(columns='gender').join(g_one_hot)
data.head()

In [None]:
# One-hot encode `country`: one indicator column per distinct value.
c_one_hot = pd.get_dummies(data['country'])
#c_one_hot.head()

In [None]:
# Build `data2`: a copy of `data` with `country` replaced by its one-hot
# indicator columns (joined on the row index). `data` itself is left unchanged.
data2 = data.drop(columns='country').join(c_one_hot)
#data2.head()

In [None]:
# Indicator column: 1 where `balance` is exactly zero, 0 otherwise.
# (Author's note: leave this for later on!)
data2['zero_balance'] = data2['balance'].eq(0.0).astype(int)

In [None]:
#data2.head()

In [None]:
# Remove the raw `balance` column from `data2`.
data2 = data2.drop(columns='balance')

In [None]:
# Move `churn` to the front of `data2`: pop it out of the frame, then
# re-insert it at column position 0.
first_column = data2.pop('churn')
data2.insert(0, 'churn', first_column)

In [None]:
# Preview the first rows of `data2` after re-ordering the columns.
data2.head()

In [None]:
# Show the dtype of each column in `data2`.
data2.dtypes

In [None]:
# Subset of `data2` used for the correlation matrix: the target plus the
# columns treated as numeric.
numeric_vars = data2.loc[:, [
    'churn',
    'credit_score',
    'age',
    'tenure',
    'products_number',
    'estimated_salary',
]]

In [None]:
# Pairwise Pearson correlations between the selected columns, rounded to
# two decimals for readability.
corr_matrix = numeric_vars.corr(method='pearson').round(decimals=2)
print(corr_matrix)

In [None]:
# Annotated heatmap of the correlation matrix, drawn on an explicit Axes.
fig, ax = plt.subplots(figsize=(14, 7))
sns.heatmap(corr_matrix, annot=True, cmap='Blues', ax=ax)
ax.set_title("Correlation Matrix for Numeric Variables")
plt.show()

**Variable Selection**

In [None]:
# Preview the first rows of `data2` before splitting it into train and test sets.
data2.head()

In [None]:
# Hold out 20% of the rows as a test set. The split is stratified on `churn`
# so the train and test sets keep the same class proportions as `data2`;
# without this, a random split of an imbalanced label can skew either set.
# The fixed `random_state` keeps the split reproducible.
train, test = train_test_split(
    data2,
    test_size=0.2,
    random_state=4,
    stratify=data2['churn'],
)

In [None]:
# Class counts of `churn` in the training split.
train['churn'].value_counts()

In [None]:
# Class counts of `churn` in the test split.
test['churn'].value_counts()

In [None]:
# Standardize the predictor columns only. The binary `churn` label is left as
# its original 0/1 values so it remains a valid class label for later models
# and for evaluation on the test set; standardizing it (as was done before)
# turns it into two arbitrary floats.
#
# `churn` is placed in column 0 of both arrays, so existing slicing
# (`[:, :1]` for the target, `[:, 1:]` for the predictors) keeps working.
# NOTE: re-run the Lasso cell after any change to the scaling.
feature_cols = train.columns.drop('churn')

# Fit on the training predictors only; the test set is transformed with the
# training mean/std so no test information leaks into the scaling.
scaler = StandardScaler()
scaled_train = np.column_stack(
    [train['churn'].to_numpy(), scaler.fit_transform(train[feature_cols])]
)
scaled_test = np.column_stack(
    [test['churn'].to_numpy(), scaler.transform(test[feature_cols])]
)

In [None]:
# Inspect the first column of the scaled training array, kept as an (n, 1) slice.
# `churn` was inserted at position 0 of `data2`, so this is the target column.
print(scaled_train[:,:1])

In [None]:
# Cross-validated Lasso with 10 folds. Predictors are every column after the
# first; the target is the first column flattened to 1-D.
lasso = LassoCV(cv=10)
lasso.fit(scaled_train[:, 1:], scaled_train[:, :1].ravel())

In [None]:
# Fitted Lasso coefficients, one per predictor column passed to `fit`.
lasso.coef_

In [None]:
#Coefficients identified as important via Lasso: all except Spain? However, all of them are very small.
#Note: the result differs when balance is treated as categorical.

In [None]:
# Regularization strength (alpha) selected by the cross-validation.
lasso.alpha_

In [None]:
#https://www.datacamp.com/tutorial/tutorial-lasso-ridge-regression