# Session 3

In [None]:
import numpy as np
import matplotlib.pyplot as plt
from scipy import stats

%matplotlib inline

## Matplotlib (cont.)

In [None]:
# Number of points shared by the three demo arrays (they are plotted against each other later).
N_SAMPLES = 1000
# Seed the legacy global generator so the random data, and every plot built from it,
# is identical after Restart & Run All.
np.random.seed(0)

x = np.arange(0, N_SAMPLES)                      # 0, 1, ..., N_SAMPLES - 1
y = np.random.randint(0, 100, size=N_SAMPLES)    # random integers in [0, 100)
z = 10 * np.random.randn(N_SAMPLES) + 50         # normal samples with mean 50, std 10

In [None]:
# Overlay histograms of y and z; alpha=0.5 keeps both visible where they overlap.
plt.style.use("ggplot") # "seaborn" noted as an alternative style name
# Keep the return value of the first histogram: a later cell reads a[0] and a[1] from it.
a = plt.hist(y, bins=20, alpha=0.5)
plt.hist(z, bins=20, alpha=0.5)
plt.show()

In [None]:
# a is the value returned by plt.hist for y: a[0] holds the per-bin counts and
# a[1] the bin boundaries.
print(f"frequencies: {a[0]}")
print(f"bins start point: {a[1]}")


In [None]:
# Bar chart with one bar per element of y, positioned at the matching x value.
plt.bar(x, y)
plt.show()

In [None]:
# Box plot of y with one artificial extreme value (250) added.
# The plotted data gets its own name instead of rebinding y to a list: the cell can then
# be re-run safely and y keeps its array type for the other cells that use it.
y_with_outlier = np.append(y, 250)
plt.boxplot(y_with_outlier)
plt.show()

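Box plots help detect anomalies: the value 250 appended above shows up as an isolated point far beyond the whiskers.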

In [None]:
# Normal distribution parameters and the x-range over which it is evaluated.
mu, sigma = 0, 0.7
min_x, max_x = -10, 10

# 1000 evenly spaced points, then the density (y) and cumulative distribution (y_c) at each.
x = np.linspace(start=min_x, stop=max_x, num=1000)
y, y_c = (
    dist_fn(x, loc=mu, scale=sigma)
    for dist_fn in (stats.norm.pdf, stats.norm.cdf)
)

In [None]:
# Draw the density and the cumulative distribution on one wide figure.
# Each line carries its own label, so the legend does not depend on plotting order.
plt.figure(figsize=(14, 8))
plt.plot(x, y, label="pdf")
plt.plot(x, y_c, label="cdf")

plt.legend(loc="upper left")
plt.show()

## Pandas

In [None]:
import pandas as pd

In [None]:
# Load the training data and promote PassengerId from a column to the row index.
df = pd.read_csv("./data/titanic_train.csv").set_index("PassengerId")
df.head()

In [None]:
# .loc selects by index label: with PassengerId as the index, 4 is looked up as a label.
df.loc[4] # index

In [None]:
# Positional lookup of a single cell: row position 3, column position 2.
# A single two-axis .iloc call is used rather than chaining [2] onto the row: integer
# keys on a label-indexed Series are deprecated as positional lookups in pandas.
df.iloc[3, 2] # position

In [None]:
# Element-wise comparison: yields one boolean per row, not a filtered frame.
df.Age > 80

In [None]:
# Boolean-mask filtering: keeps only the rows where Age is at least 70.
df[df.Age>=70]

In [None]:
# Re-read the CSV, this time letting read_csv set PassengerId as the index via index_col.
df = pd.read_csv("./data/titanic_train.csv", index_col="PassengerId")
# Optional step, left disabled: move the index back into a regular column.
# df = df.reset_index()
df.head()

In [None]:
# Concise summary of the frame: columns, non-null counts and dtypes.
df.info()

In [None]:
# Number of missing values in each column.
df.isnull().sum()

In [None]:
# Total number of missing values in the frame (sum of the per-column counts).
df.isnull().sum().sum()

In [None]:
# Summary statistics, transposed so each described column becomes a row.
df.describe().transpose()

In [None]:
# Column labels of the frame.
df.columns

In [None]:
# Remove the Ticket column from the frame.
df = df.drop(columns=['Ticket'])

In [None]:
# Mean of Age (missing values are skipped by default). Later cells use it to fill the
# gaps in Age and to centre the column.
age_mean = df['Age'].mean()
print(age_mean)

In [None]:
# Most frequent Embarked value; mode() can return several values, the first one is taken.
embark_mode = df["Embarked"].mode().iloc[0]
print(embark_mode)

In [None]:
# Fill the missing values column by column: mean age, most common port,
# and "U" as the placeholder for an unknown cabin.
df = df.fillna(value={
    'Age': age_mean,
    'Embarked': embark_mode,
    'Cabin': "U",
})

In [None]:
# A list of column names selects a DataFrame; a single name selects a Series.
print(type(df[["Age"]]), type(df["Age"]), sep="\n")

In [None]:
# One-hot encode Sex into indicator columns prefixed with "sex".
# The result is only displayed here; df itself is not modified.
pd.get_dummies(df["Sex"], prefix="sex")

In [None]:
# Titles searched for inside passenger names. Order matters: substring_exist returns the
# first entry found in the string, so 'Mrs' has to be listed before 'Mr'.
TITLE_LIST = ['Mrs', 'Mr', 'Master', 'Miss', 'Major', 'Rev', 'Dr', 'Ms',
              'Mlle', 'Col', 'Capt', 'Mme', 'Countess', 'Don', 'Jonkheer']


def substring_exist(string, substrings):
    """Return the first entry of ``substrings`` that occurs inside ``string``.

    Entries are tried in order, so earlier entries win when several match.
    When nothing matches, ``string`` is printed and ``np.nan`` is returned.
    """
    match = next(
        (candidate for candidate in substrings
         if str.find(string, candidate) >= 0),
        None,
    )
    if match is None:
        # Surface the unmatched value so it can be inspected in the cell output.
        print(string)
        return np.nan
    return match

def replace_titles(x):
    """Map a row's extracted title to a numeric score.

    Parameters
    ----------
    x : row-like mapping
        Must provide ``x['Title']``; ``x['Sex']`` is read only when the title is 'Dr'.

    Returns
    -------
    int or float
        0 for titles grouped with 'Mr', 0.75 for those grouped with 'Mrs',
        1 for those grouped with 'Miss', and 0.5 for every other title.
        For 'Dr' the score depends on sex: 0 for male, otherwise 0.75.
    """
    title = x['Title']
    if title in ('Don', 'Major', 'Capt', 'Jonkheer', 'Rev', 'Col'):
        return 0        # grouped with 'Mr'
    if title in ('Countess', 'Mme'):
        return 0.75     # grouped with 'Mrs'
    if title in ('Mlle', 'Ms'):
        return 1        # grouped with 'Miss'
    if title == 'Dr':
        # Compare case-insensitively: replace_sex in this notebook tests for lowercase
        # "male", so an exact match on 'Male' would never recognise a male doctor.
        sex = str(x['Sex']).strip().lower()
        return 0 if sex == 'male' else 0.75
    # NOTE(review): plain 'Mr', 'Mrs', 'Miss' and 'Master' all land here and get 0.5,
    # which differs from the scores of the titles grouped with them above — confirm intent.
    return 0.5


In [None]:
# Extract each passenger's title: the first TITLE_LIST entry found in the Name string.
df['Title'] = df['Name'].apply(substring_exist, substrings=TITLE_LIST)

In [None]:
# Deck letters searched for in the Cabin string; 'U' is the placeholder that was filled
# in for missing cabins. The first listed letter found in the string wins.
CABIN_LIST = ['A', 'B', 'C', 'D', 'E', 'F', 'T', 'G', 'U']
def replace_deck(x):
    """Translate the deck letter in ``x['Deck']`` into its numeric score.

    'U' scores 0 and the score rises through G, F, E, D, C, B, A up to T (1.2).
    Any value that is not one of those letters gives -1.
    """
    deck_scores = {
        "U": 0,
        "G": 0.15,
        "F": 0.3,
        "E": 0.45,
        "D": 0.6,
        "C": 0.75,
        "B": 0.9,
        "A": 1.1,
        "T": 1.2,
    }
    return deck_scores.get(x['Deck'], -1)

In [None]:
def replace_embark(x):
    """Encode ``x['Embarked']`` as a number: "S" -> 0, "Q" -> 1, anything else -> -1."""
    port_codes = {"S": 0, "Q": 1}
    return port_codes.get(x['Embarked'], -1)

In [None]:
# Replace text features with numeric codes; axis=1 hands each row to the function.
# replace_titles compares Sex against a string, so this has to run before Sex is
# converted to 0/1.
# NOTE: Title and Embarked overwrite their own inputs, so running this cell a second
# time feeds already-encoded numbers back into the encoders.
df['Title'] = df.apply(replace_titles, axis=1)
df['Embarked'] = df.apply(replace_embark, axis=1)
# Deck is first derived as a letter from Cabin, then encoded in place by replace_deck.
df['Deck'] = df['Cabin'].map(lambda x: substring_exist(x, CABIN_LIST))
df['Deck'] = df.apply(replace_deck, axis=1)


In [None]:
# Relatives travelling with the passenger: siblings/spouses plus parents/children.
df['Family_Size'] = df['SibSp'].add(df['Parch'])

In [None]:
# Split the fare across the whole party; the +1 counts the passenger themselves.
df['Fare_Per_Person'] = df['Fare'].div(df['Family_Size'].add(1))

In [None]:
def replace_sex(x):
    """Encode ``x['Sex']`` as 0 when it equals "male" and as 1 for any other value."""
    return 0 if x['Sex'] == "male" else 1

In [None]:
# Encode Sex as 0/1 in place. Not re-runnable: once Sex holds numbers, replace_sex
# maps every row to 1 because no value equals "male" any more.
df['Sex'] = df.apply(replace_sex, axis=1)

In [None]:
# Inverse of the passenger class, computed as one vectorized division rather than a
# Python-level lambda call per row.
# A class value of 0 would give inf here instead of raising ZeroDivisionError.
df["Pclass_inv"] = 1 / df["Pclass"]

In [None]:
# Distribution of Age, drawn as a histogram.
df["Age"].plot(kind="hist")


In [None]:
# Standardise Age: centre on age_mean and scale by the standard deviation.
# np.std uses ddof=0 by default, i.e. the population standard deviation.
std_dev = np.std(df["Age"])
# The leading minus flips the sign, so ages above the mean get negative values.
df["Age_norm"] = -(df.Age - age_mean)/std_dev

# Histogram of the standardised column, for comparison with the raw Age histogram.
df.Age_norm.plot(kind="hist")


In [None]:
# Preview the first rows of the frame with the engineered columns added so far.
df.head()

In [None]:
# Remove the Name column from the frame.
df = df.drop(columns="Name")

In [None]:
# Last rows of the frame.
df.tail()

In [None]:
# Oldest passengers first; any rows with a missing Age would be placed at the top.
# The sorted frame is only displayed here, df itself keeps its order.
df.sort_values(by="Age", ascending=False, na_position="first")

In [None]:
# Persist the processed frame; with default options to_csv writes the index as well.
df.to_csv("./data/neat.csv")

In [None]:
# merge