# Tracy Developer Meetup 
## January 17, 2020

<div class="alert alert-block alert-info"><h2>Data Exploration with Pandas</h2></div>

https://pandas.pydata.org/index.html

In [None]:
import numpy as np
import pandas as pd

In [None]:
# Load the raw appointment records; both date columns are parsed into datetimes up front.
df = pd.read_csv(
    "noshowappointments.csv",
    parse_dates=['AppointmentDay', "ScheduledDay"],
)

In [None]:
# (rows, columns) of the freshly loaded frame
df.shape

In [None]:
# column dtypes; the two parse_dates columns should show up as datetimes rather than object
df.dtypes

In [None]:
# preview the first 10 rows
df.head(10)

In [None]:
# summary statistics for every column, numeric and non-numeric alike
df.describe(include="all")

In [None]:
#df.describe()

In [None]:
#df.describe(include="object")

In [None]:
#df.describe(exclude=["int64","float64","object"])

In [None]:
# The identifier columns are removed before any feature engineering.
df = df.drop(columns=["PatientId", "AppointmentID"])

In [None]:
# re-run the default summary now that the identifier columns are gone
df.describe()

In [None]:
# New feature: the weekday name (e.g. "Monday") of each appointment date.
df = df.assign(AppointmentDayOfWeek=df["AppointmentDay"].dt.day_name())
df.head()


In [None]:
# sanity check: there should be at most 7 distinct weekday names
df["AppointmentDayOfWeek"].unique()

In [None]:
# Reset the time-of-day part of ScheduledDay to midnight (normalize() keeps the
# datetime dtype), so the subtraction from AppointmentDay in the next step is not
# skewed by the time of day at which the booking was made.
df["ScheduledDay"] = df["ScheduledDay"].dt.normalize()

In [None]:
# New feature: lead time from scheduling to appointment, expressed as a float
# number of days (the timedelta is divided by a one-day unit).
df["DaysBetweenScheduleAndAppointment"] = (
    df["AppointmentDay"] - df["ScheduledDay"]
) / np.timedelta64(1, 'D')

In [None]:
# summary of the new feature; a negative minimum would mean an appointment dated before it was scheduled
df['DaysBetweenScheduleAndAppointment'].describe()

<div class="alert alert-block alert-info"><h2>Matplotlib and Seaborn</h2></div>

https://matplotlib.org/

<div class="alert alert-block alert-warning"><h4>A closer look at DaysBetweenScheduleAndAppointment</h4></div>

In [None]:
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns; sns.set()

In [None]:
# same summary as at the end of the pandas section, repeated as the starting point for the plots below
df["DaysBetweenScheduleAndAppointment"].describe()

In [None]:
# Histogram of lead time over every record, including any invalid negative values.
# Title and axis labels are set so the figure can be read on its own.
fig, ax = plt.subplots()
ax.hist(df["DaysBetweenScheduleAndAppointment"])
ax.set(
    title="Lead time between scheduling and appointment (all records)",
    xlabel="DaysBetweenScheduleAndAppointment (days)",
    ylabel="Number of appointments",
)
plt.show()

In [None]:
# Isolate the suspicious rows: appointments dated before the day they were scheduled.
df_filtered = df[df["DaysBetweenScheduleAndAppointment"].lt(0)]

In [None]:
# Histogram restricted to the negative-lead-time rows selected into df_filtered.
# Title and axis labels are set so the figure can be read on its own.
fig, ax = plt.subplots()
ax.hist(df_filtered["DaysBetweenScheduleAndAppointment"])
ax.set(
    title="Records with negative lead time",
    xlabel="DaysBetweenScheduleAndAppointment (days)",
    ylabel="Number of appointments",
)
plt.show()

In [None]:
# Keep only rows whose appointment falls on or after the scheduling day.
df = df[df["DaysBetweenScheduleAndAppointment"].ge(0)]

In [None]:
# row/column count after discarding the negative-lead-time records
df.shape

In [None]:
# The raw date columns are dropped: new records will never match a date that has
# already passed, and the derived weekday / lead-time features stay in the frame.
df = df.drop(columns=["AppointmentDay", "ScheduledDay"])

<div class="alert alert-block alert-warning"><h4>A closer look at Age</h4></div>

In [None]:
# summary statistics for Age; check min and max for implausible values
df["Age"].describe()

In [None]:
# NOTE(review): seaborn is already imported (and sns.set() called) in the plotting
# setup cell above; the re-import is redundant but harmless.
import seaborn as sns
# switch the seaborn theme to the "darkgrid" style for the plots that follow
sns.set(style="darkgrid")

In [None]:
# Isolate rows with an impossible (negative) age for inspection.
df_filtered_age = df[df["Age"].lt(0)]

In [None]:
# Histogram of the negative-age rows selected into df_filtered_age.
# Title and axis labels are set so the figure can be read on its own.
fig, ax = plt.subplots()
ax.hist(df_filtered_age["Age"])
ax.set(
    title="Records with negative Age",
    xlabel="Age",
    ylabel="Number of appointments",
)
plt.show()

In [None]:
# Discard rows with a negative age; an age of 0 is kept.
df = df[df["Age"].ge(0)]

In [None]:
# row/column count after discarding the negative-age records
df.shape

https://seaborn.pydata.org/

<div class="alert alert-block alert-warning"><h4>One-hot encoding</h4></div>

In [None]:
# One-hot encode the two low-cardinality categoricals. get_dummies replaces each
# listed column with one indicator column per distinct value.
df = pd.get_dummies(
    df,
    columns=["AppointmentDayOfWeek", "Gender"],
)

In [None]:
# preview the frame with the new indicator columns for weekday and gender
df.head()

In [None]:
# One-hot encode Neighbourhood. No separate drop of the original column is needed:
# get_dummies(columns=...) already removes the columns it encodes.
df = pd.get_dummies(
    df,
    columns=["Neighbourhood"],
)

In [None]:
# preview again; expect one indicator column per distinct Neighbourhood value
df.head()

<div class="alert alert-block alert-warning"><h4>Target Variable</h4></div>

In [None]:
# inspect the raw target labels before converting them to 0/1 in the next cell
df["No-show"].unique()

In [None]:
# Encode the target: 'Yes' (patient did not show up) -> 1, 'No' -> 0.
# NOTE: map() yields NaN for any value missing from the dict, so running this cell
# a second time (when the column already holds 1/0) would turn the column into NaN.
df["No-show"] = df["No-show"].map({'Yes':1, 'No':0})

In [None]:
# expected to show only 1 and 0; a NaN here would indicate an unmapped label
df["No-show"].unique()

In [None]:
# Re-append the target under a new name so that it becomes the last column;
# the feature/target split further down relies on that position.
df = df.drop(columns=["No-show"]).assign(**{"no-show-boolean": df["No-show"]})

In [None]:
#data model is ready for machine learning algorithms

<div class="alert alert-block alert-info"><h2>Machine Learning with Scikit-Learn</h2></div>

https://scikit-learn.org/stable/

In [None]:
#import scikit-learn
from sklearn.model_selection import train_test_split
import warnings
# NOTE(review): this silences every warning category for the rest of the session,
# including any solver convergence warnings raised by the models fitted below.
# Consider narrowing the filter to the specific warnings that are known to be noise.
warnings.filterwarnings('ignore')

In [None]:
# Feature matrix: every column except the last (the target), as a NumPy array.
X = df.iloc[:, :-1].to_numpy()

In [None]:
# first five feature rows
X[:5]

In [None]:
# Target vector: the last column of the frame, as a NumPy array.
y = df.iloc[:, -1].to_numpy()

In [None]:
# first 500 target values
y[:500]

### Split the dataset into train set and test set

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the rows for evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=1,
)

In [None]:
# shapes of the train and test feature matrices (test_size=0.3 above, so roughly a 70/30 row split)
X_train.shape, X_test.shape

## Logistic Regression

In [None]:
from sklearn.linear_model import LogisticRegression

# fit() returns the estimator itself, so construction and training are chained.
log_reg = LogisticRegression(solver='lbfgs').fit(X_train, y_train)
# Predicted class labels for the held-out rows (shown as the cell output).
log_reg.predict(X_test)

In [None]:
# mean accuracy of the fitted model on the held-out test rows
log_reg.score(X_test, y_test)

In [None]:
# per-class probability estimates for the first 50 test rows (column order follows log_reg.classes_)
log_reg.predict_proba(X_test)[:50]

In [None]:
# Refit the same estimator on the full dataset (train and test rows together).
# From this point on, log_reg has seen X_test, so scoring it against X_test is no
# longer a held-out evaluation.
log_reg.fit(X, y)

In [None]:
# sklearn.externals.joblib was deprecated and later removed from scikit-learn;
# joblib is a standalone package (installed as a scikit-learn dependency).
# Fall back to the bundled copy only on old scikit-learn installs.
try:
    import joblib
except ImportError:
    from sklearn.externals import joblib

In [None]:
# Persist the fitted estimator instance (log_reg). Dumping the LogisticRegression
# class itself would save no learned coefficients, and the reloaded object could
# not be used to predict.
joblib.dump(log_reg, "final_log_reg_model.sav")

## Decision Tree

In [None]:
from sklearn.tree import DecisionTreeClassifier

# Tree construction involves randomness (feature order / tie-breaking between
# equally good splits), so random_state is fixed to make the reported score
# reproducible. The seed matches the one used for the train/test split.
tree = DecisionTreeClassifier(random_state=1)
tree.fit(X_train, y_train)
# mean accuracy on the held-out test rows
tree.score(X_test, y_test)