In [None]:
import pandas as pd     # load and manipulate data for One-Hot Encoding
import numpy as np     # calculate the mean and standard deviation
import xgboost as xgb     # XGBoost stuff
from sklearn.model_selection import train_test_split     # split data into training and test sets
from sklearn.metrics import balanced_accuracy_score, roc_auc_score, make_scorer
from sklearn.model_selection import GridSearchCV     # cross validation
from sklearn.metrics import confusion_matrix     # creates a confusion matrix
from sklearn.metrics import plot_confusion_matrix     # draws a confusion matrix

In [None]:
# Read the Telco churn CSV into the working DataFrame.
# NOTE(review): this is an absolute path on one machine; point it at your own
# copy of the file before running.
telco_csv = "C:/Users/lynst/Documents/Datasets/Kaggle/Jack Chang/telco_churn.csv"
df = pd.read_csv(telco_csv)

In [None]:
# Preview the first five rows of the loaded data.
df.head(n=5)

In [None]:
# Remove four columns by label. `columns=` names the axis directly and is
# equivalent to passing the labels together with axis=1 (axis=0 would drop rows).
df.drop(columns=['Churn Label', 'Churn Score', 'CLTV', 'Churn Reason'], inplace=True)
df.head(n=5)

Some of the other columns only contain a single value and will not be useful for classification. For example:

In [None]:
# List the distinct values stored in 'Count'.
pd.unique(df['Count'])

In [None]:
# List the distinct values stored in 'Country'.
pd.unique(df['Country'])

In [None]:
# List the distinct values stored in 'State'.
pd.unique(df['State'])

So we can omit those variables. In contrast, City contains a bunch of different city names, so we will leave it in.

In [None]:
# List the distinct city names stored in 'City'.
pd.unique(df['City'])

We will also remove 'CustomerID' because it is different for every customer and useless for classification. Lastly we will drop 'Lat Long' because there are separate columns for Latitude and Longitude.

In [None]:
# Remove the per-customer identifier, the columns inspected above with
# `unique()`, and the combined 'Lat Long' column, using `columns=` instead of
# axis=1 to name the axis.
df.drop(columns=['CustomerID', 'Count', 'Country', 'State', 'Lat Long'], inplace=True)
df.head(n=5)

In [None]:
# Replace every space inside the city names with an underscore.
# The result is assigned back to the column rather than calling an in-place
# method on the `df['City']` selection: with pandas Copy-on-Write that selection
# is a temporary object, so an in-place replace would leave `df` unchanged
# (and pandas 2.x emits a chained-assignment warning for it).
# regex=True makes the single-space pattern match within each string instead of
# only matching values that are exactly ' '.
df['City'] = df['City'].replace(' ', '_', regex=True)
df.head()

### Note
Although it's okay to have whitespace in the city names in 'City' for XGBoost and classification, we can't have any whitespace if we want to draw a tree. So we take care of that now by replacing the whitespace in the city names with an underscore character (`_`).

### Also Note
We could just as easily remove whitespace from all values, not just city names, but we will wait to do that until after we have identified missing values.

In [None]:
# Show the first ten distinct city names to check the result of the replacement.
pd.unique(df['City'])[:10]

We also need to eliminate the whitespace in the column names, so we will replace it with underscores.

In [None]:
# Swap every space in the column labels for an underscore.
# `str.replace` requires both a pattern and a replacement; calling it with no
# arguments raises a TypeError. regex=False treats ' ' as a literal string.
df.columns = df.columns.str.replace(' ', '_', regex=False)