## Helpful lines of code when going through the Data Science Methodology

#### data cleaning

In [None]:
    # Removes a column from a pandas DataFrame if its percentage of missing values is greater than the given `threshold`.
    # The user also provides a `unique_value_threshold`: a column is removed if its percentage of unique values is below that value.
def drop_columns(input_df, threshold, unique_value_threshold):
    """Return a copy of ``input_df`` without sparse or low-variety columns.

    A column is dropped when at least one of the following holds:

    * its percentage of missing values is strictly greater than
      ``threshold``;
    * its percentage of unique values (distinct non-null values divided by
      the number of non-null values) is strictly below
      ``unique_value_threshold``.

    Args:
        input_df: pandas DataFrame to filter. It is never modified.
        threshold: maximum tolerated percentage (0-100) of missing values.
        unique_value_threshold: minimum required percentage (0-100) of
            unique values.

    Returns:
        A new DataFrame containing only the columns that passed both tests.
    """
    total_rows = len(input_df)
    if total_rows == 0:
        # With no rows neither percentage is defined, so keep every column.
        return input_df.copy()

    to_drop = []
    for name in input_df.columns:
        column = input_df[name]
        non_null = int(column.notna().sum())
        missing = (total_rows - non_null) * 100 / total_rows
        # An all-missing column has nothing to compare; treat its uniqueness
        # as 0% instead of dividing by zero.
        unique = (column.nunique() / non_null) * 100 if non_null else 0.0
        # Collect the name once even when both tests fail, so the column is
        # never removed twice.
        if unique < unique_value_threshold or missing > threshold:
            to_drop.append(name)
    return input_df.drop(columns=to_drop)

In [None]:
# input a column and return the count of missing values:
def total_missing(df, column_name):
    """Describe how many missing values a column of ``df`` contains.

    The column is looked up by exact name first. If no column has exactly
    that name, the last column whose (string) name contains ``column_name``
    as a substring is used instead.

    Args:
        df: pandas DataFrame to inspect.
        column_name: full name, or a fragment of the name, of the column.

    Returns:
        A string of the form ``"<column_name> has <n> missing values"``.

    Raises:
        ValueError: if no column matches ``column_name``.
    """
    matches = [col for col in df.columns if col == column_name]
    if not matches:
        # Substring fallback only makes sense for string labels.
        matches = [
            col
            for col in df.columns
            if isinstance(col, str)
            and isinstance(column_name, str)
            and column_name in col
        ]
    if not matches:
        raise ValueError(f"could not find column {column_name!r}")

    total = df[matches[-1]].isnull().sum()
    return f"{column_name} has {total} missing values"

In [None]:
# returns the mean if the specified column is numerical; otherwise returns a list of the mode(s).
def calc_mean_mode(df, column_name):
    """Return the mean of a numeric column, or the mode(s) of any other column.

    The column is looked up by exact name first. If no column has exactly
    that name, the first column whose (string) name contains ``column_name``
    as a substring is used instead.

    Args:
        df: pandas DataFrame to inspect.
        column_name: full name, or a fragment of the name, of the column.

    Returns:
        The mean rounded to 2 decimal places when the column has a numeric
        dtype, otherwise a list with the column's mode(s).

    Raises:
        ValueError: if no column matches ``column_name``.
    """
    from pandas.api.types import is_numeric_dtype

    matches = [col for col in df.columns if col == column_name]
    if not matches:
        # Substring fallback only makes sense for string labels.
        matches = [
            col
            for col in df.columns
            if isinstance(col, str)
            and isinstance(column_name, str)
            and column_name in col
        ]
    if not matches:
        # Raise (rather than return) the error so callers cannot mistake it
        # for a computed value.
        raise ValueError('could not find column')

    column = df[matches[0]]
    if is_numeric_dtype(column):
        return round(column.mean(), 2)
    return list(column.mode())

#### data wrangling

In [None]:
# examples of how to manipulate datetime object:
import pandas as pd

# Parse the 'time' column once and reuse the result for every derived feature.
# pd.to_datetime is used instead of astype('datetime64') because pandas 2.x
# rejects the unit-less 'datetime64' dtype.
time_as_datetime = pd.to_datetime(train_data['time'])

train_data['Year'] = time_as_datetime.dt.year
train_data['Month_of_year'] = time_as_datetime.dt.month
# Series.dt.weekofyear was removed in pandas 2.0; isocalendar().week is the
# replacement (requires pandas >= 1.1 and yields an unsigned integer column).
train_data['Week_of_year'] = time_as_datetime.dt.isocalendar().week
train_data['Day_of_year'] = time_as_datetime.dt.dayofyear
train_data['Day_of_month'] = time_as_datetime.dt.day
train_data['Day_of_week'] = time_as_datetime.dt.dayofweek
# Hours elapsed since the start of the week (Monday 00:00 -> 0); this is the
# simplified form of (dayofweek * 24 + 24) - (24 - hour).
train_data['Hour_of_week'] = time_as_datetime.dt.dayofweek * 24 + time_as_datetime.dt.hour
train_data['Hour_of_day'] = time_as_datetime.dt.hour

In [None]:
# how to drop multiple columns from a dataframe:
# Remove several columns in one call; drop() returns a new frame, which is
# assigned back to the same name.
train_data = train_data.drop(
    labels=['Week_of_year', 'Day_of_year', 'Hour_of_week', 'time'],
    axis='columns',
)

In [None]:
### example of how to see interaction between categorical features
import pandas as pd
# Load the Titanic sample (with an extracted 'Title' column) straight from GitHub.
title_df = pd.read_csv(filepath_or_buffer='https://raw.githubusercontent.com/Explore-AI/Public-Data/master/Data/regression_sprint/titanic_train_title.csv')
# Contingency table: one row per Title, one column per Sex, cells hold counts.
pd.crosstab(title_df['Title'], title_df['Sex'])

In [None]:
## encode categorical variables for input into ML Model:
def dummy_encode_titles(input_df):
    """Dummy-encode the ``Title`` column of ``input_df``.

    Args:
        input_df: pandas DataFrame with a ``Title`` column. It is not
            modified.

    Returns:
        A new DataFrame holding only the indicator columns, named
        ``Title_<value>``. The first category is dropped (``drop_first=True``)
        and missing titles get no indicator column of their own.
    """
    # get_dummies builds a new frame from the selected column, so copying the
    # whole input first is unnecessary.
    return pd.get_dummies(
        input_df['Title'],
        prefix='Title',
        prefix_sep='_',
        dummy_na=False,
        drop_first=True,
    )

# Encode the titles, then preview only the generated 'Title...' indicator columns.
encode_df = dummy_encode_titles(title_df)
dummy_cols = [name for name in encode_df.columns if name.startswith('Title')]
encode_df[dummy_cols].head()

In [None]:
# five number summary taking a list as input
from numpy import percentile
def five_num_summary(items):
    """Return the five-number summary of ``items``.

    Args:
        items: non-empty sequence (list, tuple or 1-D array) of numbers.

    Returns:
        A dict with the keys ``'max'``, ``'median'``, ``'min'``, ``'q1'`` and
        ``'q3'``, each rounded to 2 decimal places.

    Raises:
        ValueError: if ``items`` is empty (raised by ``max``).
    """
    highest = max(items)
    lowest = min(items)
    # One percentile call yields all three quartiles.
    q1, median, q3 = percentile(items, [25, 50, 75])
    return {
        'max': round(highest, 2),
        'median': round(median, 2),
        'min': round(lowest, 2),
        'q1': round(q1, 2),
        'q3': round(q3, 2),
    }

#### data visualizations

In [None]:
# Import libraries 
import matplotlib.pyplot as plt #used for plotting data 
import numpy as np #used for mathematical operations
import pandas as pd #used to loading CSV data

In [None]:
#Plot a donut chart

# Initially we create a pie chart as the base of our donut chart. 
plt.pie(tips, labels=meal_time, startangle=140, autopct='%1.1f%%')

# A white disc drawn over the middle of the pie turns it into a donut.
fig = plt.gcf()  # current figure, i.e. the one the pie was just drawn on
centre_circle = plt.Circle(xy=(0, 0), radius=0.77, fc='white', linewidth=1.25)
fig.gca().add_artist(centre_circle)

plt.legend(loc="best")
plt.axis('equal')  # equal axis scaling
plt.show()

In [None]:
#line graph
total_meals = {'31/01/1990': 1340, '28/02/1990': 1338, '31/03/1990': 1330, '30/04/1990': 1328, '31/05/1990': 1335, '30/06/1990': 1335}
dates = [*total_meals]  # the dictionary keys are the 'dd/mm/yyyy' date strings
x_ax = [day_month_year.split('/')[1] for day_month_year in dates]  # the 'mm' part of each date
y_ax = [total_meals[day_month_year] for day_month_year in dates]  # meals sold on each date

# Draw the line, then label the chart
plt.plot(x_ax, y_ax, color='green')
plt.title('Line Graph Showing the Total Number of Meals Sold Over the First 6 Months of 1990 \n')
plt.xlabel('Month')
plt.ylabel('Number of Total meals sold')

plt.show()

In [None]:
#bar plot
plt.bar(week_day, bill, color='orange')

# Title first, then the axis labels; the '\n' characters add spacing
# between the text and the plot area.
plt.title('Total bill of customers for Thur-Sun \n')
plt.xlabel('\n Days of the Week(Thur-Sun)')
plt.ylabel('Total Bill')

# Render the figure
plt.show()

In [None]:
# scatter plot
# For this plot, we need to access the underlying Axes object used to create our chart. 
# To display our data correctly, we also set the `figsize` argument to increase the size of the plot. 
# A wider-than-default figure; the Axes handle is needed for the legend below.
fig, ax = plt.subplots(figsize=(10, 5))

# Bill total on x, tip on y; marker colour encodes the 'size' column and
# alpha < 1 keeps overlapping markers visible.
scatter = ax.scatter(df['total_bill'], df['tip'], c=df['size'], alpha=0.8)

# Build the legend entries from the colour mapping of the scatter artist.
ax.legend(*scatter.legend_elements(), title="Group Size", loc="best")

# Titles and axis labels, set directly on the Axes
ax.set_title('Scatter Plot Showing the Average Amount Tipped vs Group Size \n')
ax.set_xlabel('Bill Total ($)')
ax.set_ylabel('Amount Tipped ($)')

plt.show()

In [None]:
# pie chart
# Share of each 'target' value, shown as percentages with two decimals.
modelling_data['target'].value_counts().plot.pie(autopct="%.2f")
plt.show()

In [None]:
# 3 box plots side by side
# NOTE(review): `sns` (presumably seaborn) is not imported anywhere in the visible file - confirm.
fig, axes = plt.subplots(nrows=1, ncols=3, figsize=(18, 5))
# One panel per feature: (column plotted on y, panel title). Outliers are hidden.
for box_ax, (y_column, panel_title) in zip(
    axes,
    [
        ('GSM_RECHARGE_D_M1', 'GSM recharge'),
        ('MPESA_RECHARGE_D_M1', 'Mpesa recharge'),
        ('gsm_tenure', 'Tenure'),
    ],
):
    sns.boxplot(ax=box_ax, data=modelling_data_0, x='target', y=y_column, showfliers=False).set_title(panel_title)

#### train, test split

In [None]:
# The import must come before the first call; it was previously placed after it.
from sklearn.model_selection import train_test_split

# Option 1: split everything except an explicit list of excluded columns
# (errors='ignore' skips any listed column that is not present).
train, test = train_test_split(
    modelling_data_111.drop(
        ['Unnamed: 0', 'sample_id', 'rag_opt_year_month', 'rag_opt_year_month_str', 'Age_at_optin_mths',
         'msisdn', 'pmt_meth_cd', 'connection_dt', 'frst_cl_zone',
         'frst_cl_site_name', 'frst_cl_sales_cluster', 'frst_cl_region',
         'manufacturer', 'model', 'os', 'gadget_type', 'device_category',
         'smart_phone_flag', 'device_type', 'last_used_prev_monthend_dt',
         'last_used_monthend_dt', 'last_used_yearmonth_str',
         'success_count', 'ninety_count'],
        axis=1,
        errors='ignore',
    ),
    test_size=0.2,
    random_state=113,
)

# or
# Option 2: split only an explicit list of columns. Running this after
# option 1 overwrites `train` and `test`.
train, test = train_test_split(
    modelling_data_111[CANDIDATE_VARS],
    test_size=0.5,
    random_state=113,
)

In [None]:
# Training target: the 'target' column as a flat float array.
y = train[['target']].astype('float').values.ravel()

# Training features: the selected numeric dtypes only, without the target.
X = train.select_dtypes(include=['int64', 'int32', 'float', 'uint8']).drop(columns=["target"])

# The same two steps for the hold-out set.
y_test = test[['target']].astype('float').values.ravel()

X_test = test.select_dtypes(include=['int64', 'int32', 'float', 'uint8']).drop(columns=["target"])

In [1]:
from sklearn.inspection import permutation_importance

### Feature importance using permutation
# NOTE: `model` must already be fitted before this cell is run.
result = permutation_importance(model, X_test, y_test, n_repeats=3, random_state=113)
### The result holds `importances_mean`, `importances_std` and the raw per-repeat `importances` array.
# Label the importances with the columns of the frame they were computed on
# (X_test), so labels and values always line up.
feature_importance = pd.DataFrame(
    result.importances_mean,
    index=X_test.columns,
    columns=['importance'],
).sort_values('importance', ascending=False)

NameError: name 'model' is not defined  (this cell was run before `model` was created; run the model-fitting cell below first)

#### example of fitting a model

In [None]:
# Gradient-boosted classifier with a fixed seed, trained on the features and target built above.
model = XGBClassifier(
    n_estimators=150,
    max_depth=4,
    use_label_encoder=False,
    eval_metric='logloss',
    random_state=113,
)
model.fit(X, y)

#### performance evaluation

In [None]:
### Classification report
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import cross_val_score

# Label a row positive when its predicted probability for the second class
# reaches the 0.288 cut-off.
rfc_predict_test = (model.predict_proba(X_test[X.columns])[:, 1] >= 0.288).astype(bool)
# 5-fold cross-validated ROC AUC, computed on the test set
# (cross_val_score refits clones of `model` on each fold).
rfc_cv_score = cross_val_score(model, X_test[X.columns], y_test, cv=5, scoring='roc_auc')

print("=== Confusion Matrix : Test ===")
print(confusion_matrix(y_test, rfc_predict_test))
print('\n')
print("=== Classification Report ===")
print(classification_report(y_test, rfc_predict_test))
print('\n')
# Report the cross-validation scores so the five fits above are not wasted.
print("=== Cross-Validated ROC AUC Scores ===")
print(rfc_cv_score)
print("Mean ROC AUC:", rfc_cv_score.mean())

In [None]:
# heatmap.
plt.figure(figsize=(24, 10))
# Keep the object returned by sns.heatmap so the title can be set on it.
# The colour scale is pinned to [-1, 1] and annot=True writes each
# correlation value into its cell.
heatmap = sns.heatmap(X_test.corr(), annot=True, vmin=-1, vmax=1)
# `pad` is the gap between the title and the top of the heatmap.
heatmap.set_title('Correlation Heatmap', pad=12, fontdict={'fontsize': 12});

In [None]:
# ### ROC
# plot_roc_curve was deprecated in scikit-learn 1.0 and removed in 1.2;
# RocCurveDisplay.from_estimator is its replacement (scikit-learn >= 1.0).
from sklearn.metrics import RocCurveDisplay

# Draw the train curve first, then add the test curve to the same axes.
fig = RocCurveDisplay.from_estimator(model, X, y, name="train")
fig = RocCurveDisplay.from_estimator(model, X_test, y_test, ax=fig.ax_, name="test")
#fig = RocCurveDisplay.from_estimator(logRegModel, X_test_oot, y_test_oot, ax=fig.ax_, name="OOT Dates / name")
fig.figure_.suptitle("ROC curve comparison")
plt.show()

In [None]:
# Class probabilities for the test set; column 1 is passed on as the score.
predicted_probas = model.predict_proba(X_test)
kds.metrics.plot_cumulative_gain(
    y_test,
    predicted_probas[:, 1],
)

In [None]:
# Lift chart for the same scores; relies on `predicted_probas` from the previous cell.
kds.metrics.plot_lift(y_test, predicted_probas[:,1])