<a href="https://colab.research.google.com/github/matthewpecsok/4482_fall_2022/blob/main/tutorials/Titanic%20data%20exploration%20tutorial%20-%20EDA.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# The sinking of the RMS Titanic is one of the most infamous shipwrecks in history.
# On April 15, 1912, during her maiden voyage, the Titanic sank after colliding with 
# an iceberg, killing 1502 out of 2224 passengers and crew.
# This sensational tragedy shocked the international community and led to better 
# safety regulations for ships. One of the reasons that the shipwreck led to such 
# loss of life was that there were not enough lifeboats for the passengers and crew. 
# Although there was some element of luck involved in surviving the sinking, 
# some groups of people such as women, children, and the upper-class 
# were more likely to survive than others.

# VARIABLE DESCRIPTIONS:

# PassengerId     Unique passenger identifier
# Survived        Survival (0 = No; 1 = Yes)
# Pclass          Passenger Class (1 = 1st; 2 = 2nd; 3 = 3rd). Pclass is a proxy for socio-economic status (SES):
#                     1st ~ Upper; 2nd ~ Middle; 3rd ~ Lower
# Name            Name
# Sex             Sex
# Age             Age (Age is in Years; Fractional if Age is less than One (1). If the Age is Estimated, it is in the form xx.5)
# SibSp           Number of Siblings/Spouses Aboard
# Parch           Number of Parents/Children Aboard
# Ticket          Ticket Number
# Fare            Passenger Fare
# Cabin           Cabin
# Embarked        Port of Embarkation (C = Cherbourg; Q = Queenstown; S = Southampton)



# 1 Set up, data import, inspection and transformation



## 1.1 Set up, data import and inspection



### Import a csv file

In [None]:
# Read the Titanic training CSV directly from its GitHub URL into a DataFrame named `titanic`.
titanic = pd.read_csv("https://raw.githubusercontent.com/matthewpecsok/4482_fall_2022/main/data/titanic.train.csv")

### Examine the overall data frame

### info() shows the number of observations, and the number, names, types of columns

In [None]:
# Print the number of observations plus the number, names and types of the columns.
titanic.info()

### head() shows you the first 5 observations

In [None]:
# head() with no argument shows the first 5 observations (tail() would show the last 5 instead).
titanic.head()

### You can retrieve and save the number of rows and number of columns of a data frame

In [None]:
# shape is a tuple: position 0 is the number of rows and position 1 the number of columns,
# i.e. the same information as nrow() and ncol() in R.
print(titanic.shape[0]) # shape gives you the same information as nrow and ncol the 0th position is rows, the 1st position is columns

# Save the row count in a variable so it can be reused, then print it.
rows = titanic.shape[0]
print(rows)

In [None]:
# Position 1 of the shape tuple is the number of columns; store it in `col` and display it.
col = titanic.shape[1]
col

### describe

In [None]:
# describe(include='all') is similar to summary() in R: it shows the mean, standard deviation
# and the five-number statistics indicating the spread of each column's values.
# Compared with R's summary(), the NA count and the column types are missing (but you do get
# the standard deviation), and character-type fields do not get multiple value counts.
titanic.describe(include='all')

### null values

In [None]:
# Count the null values in each column.
titanic.isnull().sum()

### display first few rows or last few

In [None]:
# Slice rows 400 through 405 out of the middle of the frame (the slice end, 406, is exclusive).
titanic[400:406]

In [None]:
# Show only the first row.
titanic.head(1) # in R head(titanic, n=1) 

In [None]:
# The same single row, selected with a slice instead of head(); the R counterpart is titanic[1,].
titanic[0:1] #titanic[1,]

In [None]:
# First ten rows.
titanic.head(10) 

In [None]:
# Last ten rows.
titanic.tail(10) 

### get help on function

In [None]:
# Print the built-in documentation of the head method.
help(titanic.head) 


## 1.2 Data transformation

### drop columns with no beneficial information for the model

In [None]:
# PassengerId, Name and Ticket are unique identifiers; without additional feature extraction
# they are not interesting for further analysis, so `titanic` is rebound to a frame without them.
titanic = titanic.drop(columns=['PassengerId','Name','Ticket'])



In [None]:
# Display the frame to confirm that the identifier columns are gone.
titanic

### pandas skips NA values when numpy reductions are applied to a Series

In [None]:
# Small demo frame: five numbers followed by two missing values.
# np.nan is the canonical spelling; the np.NaN / np.NAN aliases were removed in NumPy 2.0.
df = pd.DataFrame(data=[1,2,3,4,5,np.nan,np.nan],columns=['test'])
print(df)

# np.std / np.mean called on a pandas Series are delegated to the Series' own std / mean
# methods, and those skip nulls by default.
print(np.std(df.test))
print(np.mean(df.test))

# So it is pandas, not numpy, that excludes the nulls here. On a plain ndarray
# (e.g. df.test.to_numpy()) np.std / np.mean return nan; use np.nanstd / np.nanmean there.

### drop rows with missing values in age

In [None]:

# Remove observations with missing Age values. 
# This missing data handling approach has the obvious disadvantage of limiting
# the applicability of the model to data with missing age.
# Keeping observations with missing Age values requires careful imputation of the missing ages.
# The various missing data imputation methods are beyond the knowledge required for this tutorial.

# Missing values in Age can cause problems. The commands below are R syntax (presumably carried
# over from an R version of this tutorial); test them in R when you have time.
# var(titanic$Age)
# mean(titanic$Age)
# var(titanic[-which(is.na(titanic$Age)), ]$Age)
# sd(titanic[-which(is.na(titanic$Age)), ]$Age)

# In pandas, dropna(subset=['Age']) identifies and removes the observations in which
# the Age value is missing (the role which(is.na(...)) plays in the R commands above).

# Standard deviation of Age while the rows with missing Age are still in the frame.
print(np.std(titanic.Age))

# Row count before the drop, then the frame after the drop.
print(titanic.shape[0]) 
titanic = titanic.dropna(subset=['Age']) 
print(titanic)

In [None]:
# Rows whose Cabin value is exactly 'B96 B98'.
titanic[titanic['Cabin']=='B96 B98']

### count values in columns

In [None]:
# Toy example: build a one-column frame of letters and count how often each letter occurs.
pd.DataFrame(['a','b','a','c','c','c','c','c','c','d','d','d','d',],columns=['letter']).letter.value_counts() # conveniently sorts for you

In [None]:
# The same counting applied to the Cabin column of the Titanic data.
titanic.Cabin.value_counts() # conveniently sorts for you

### barplot of Cabin

In [None]:
# Bar chart with one bar per distinct Cabin value; dropna=True leaves missing cabins out of the counts.
titanic.Cabin.value_counts(dropna=True).plot(kind='bar')

In [None]:
# Rows in which Cabin is null.
titanic[pd.isnull(titanic.Cabin)] 

### replace null with 'missing'

In [None]:
# Give every null Cabin the literal label 'missing'.
titanic['Cabin'] = titanic['Cabin'].fillna('missing')

### now show it worked

In [None]:
# dropna=False keeps nulls in the tally, so any Cabin that is still null would show up here.
titanic.Cabin.value_counts(dropna=False)

### replace null with missing and show it worked 

In [None]:
# Same treatment for Embarked: label nulls as 'missing', then count every value
# (nulls included) to show that none remain.
titanic['Embarked'] = titanic['Embarked'].fillna('missing')
titanic.Embarked.value_counts(dropna=False)

In [None]:
# Summary of Age and Sex side by side; include='all' asks describe() to cover both columns.
titanic[['Age','Sex']].describe(include='all')

# 2 Understanding numeric variables

In [None]:
# Number of observations in each passenger class.
titanic.Pclass.value_counts()

In [None]:
# Summary of Sex and Age side by side; include='all' asks describe() to cover both columns.
titanic[['Sex','Age']].describe(include='all')

### obtain the mean, median, max, min and range of a numeric variable

In [None]:


# Print the mean, median, minimum and maximum of Age, in that order.
for summarize in (np.mean, np.median, np.min, np.max):
  print(summarize(titanic.Age))

In [None]:
# Range of Age: the largest value minus the smallest one.
range_age = titanic.Age.max() - titanic.Age.min()
print(f"range of diff: {range_age}")

In [None]:
# min max normalization of first observation
(titanic.Age[0]-np.min(titanic.Age))/range_age

In [None]:
# Default descriptive summary of Age.
titanic.Age.describe()

In [None]:
# Ask only for the 0%, 50% and 100% percentiles, i.e. minimum, median and maximum.
titanic.Age.describe(percentiles=[0,0.5,1])

In [None]:
# Six evenly spaced percentiles: 0%, 20%, 40%, 60%, 80% and 100%.
titanic.Age.describe(percentiles=np.linspace(0,1,6))

In [None]:
# Eleven evenly spaced percentiles: every 10% step from 0% to 100%.
titanic.Age.describe(percentiles=np.linspace(0,1,11))

In [None]:
# Box plot of Age.
titanic.Age.plot(kind='box',title='Boxplot of Age in the titanic data set')
# plt.show() renders the figure; plt.plot() with no arguments draws nothing and only
# leaves an empty list as the cell output.
plt.show()

In [None]:
# One histogram per column; each figure is shown before the next one is started.
for hist_column in ('Age', 'SibSp'):
  a = titanic[hist_column].plot(kind='hist',title='Histogram of %s in the titanic data set' % hist_column)
  plt.show()

In [None]:
def num_descriptives(column_name, data=None):
  """Print percentile summaries and draw a box plot and a histogram for one column.

  Parameters
  ----------
  column_name : str
      Name of the column to describe; it is also inserted into the plot titles.
  data : pandas.DataFrame, optional
      Frame that holds the column. When omitted, the notebook-level ``titanic`` frame is
      used (looked up at call time), so existing one-argument calls behave as before.

  Returns
  -------
  None
      The summaries are printed and the two figures are shown as side effects.
  """
  if data is None:
    data = titanic
  values = data[column_name]
  # Every 10% step from 0% to 100% first, then only minimum, median and maximum.
  print(values.describe(percentiles=np.linspace(0,1,11)))
  print(values.describe(percentiles=[0,0.5,1]))
  values.plot(kind='box',title='Boxplot of %s in the titanic data set' % (column_name))
  plt.show()
  values.plot(kind='hist',title='Histogram of %s in the titanic data set' % (column_name))
  plt.show()

num_descriptives('SibSp')


In [None]:
# Run the same summaries and plots for each of these four columns in turn.
for column in ['Age','SibSp','Fare','Parch']:
  num_descriptives(column)

# 3 Exploring categorical/factor values

### value counts

In [None]:
# Number of observations for each Survived value (0 = No; 1 = Yes).
titanic.Survived.value_counts()

## barplot

In [None]:
# Plot the Survived counts as bars, one bar per value.
titanic.Survived.value_counts().plot(kind='bar',title='Bar plot of Survived. Showing Counts')
plt.show()

In [None]:
# Share of each Survived value (normalize=True turns counts into proportions), rounded to 2 digits.
round(titanic.value_counts('Survived',normalize=True),ndigits=2)

In [None]:
# Scatter plot of SibSp (x axis) against Parch (y axis).
titanic.plot.scatter('SibSp','Parch')

In [None]:
# Show only the SibSp and Parch columns.
titanic[['SibSp','Parch']]

In [None]:
# Covariance matrix of SibSp and Parch.
titanic[['SibSp','Parch']].cov()

In [None]:
# Population standard deviation (ddof=0, the value np.std computes) of SibSp.
# Calling the pandas method directly avoids np.std(DataFrame): numpy forwards axis=None to
# pandas, which is deprecated there and will reduce over both axes in a future version.
titanic[['SibSp']].std(ddof=0)

In [None]:
# Population variance (ddof=0, the value np.var computes) of SibSp.
# Calling the pandas method directly avoids np.var(DataFrame): numpy forwards axis=None to
# pandas, which is deprecated there and will reduce over both axes in a future version.
titanic[['SibSp']].var(ddof=0)

In [None]:
# Population standard deviation (ddof=0, the value np.std computes) of Parch.
# Calling the pandas method directly avoids np.std(DataFrame): numpy forwards axis=None to
# pandas, which is deprecated there and will reduce over both axes in a future version.
titanic[['Parch']].std(ddof=0)

In [None]:
# Population variance (ddof=0, the value np.var computes) of Parch.
# Calling the pandas method directly avoids np.var(DataFrame): numpy forwards axis=None to
# pandas, which is deprecated there and will reduce over both axes in a future version.
titanic[['Parch']].var(ddof=0)

# 4 Exploring the relationship of Multiple Variables
i.e., numeric by categorical, numeric by numeric, etc.

## correlation between numeric variables.

In [None]:
# Correlation matrix of SibSp and Parch.
titanic[['SibSp','Parch']].corr()

In [None]:
# Generate the correlation matrix of the numeric variables Age, SibSp, Parch and Fare.
titanic[['Age','SibSp','Parch','Fare']].corr()

## Python equivalent of R's pairs.panels scatter plot matrix

In [None]:
# Scatter-plot matrix of Age, SibSp, Parch and Fare on a 15 x 10 figure, drawn with alpha=0.2.
pd.plotting.scatter_matrix(titanic[['Age','SibSp','Parch','Fare']],figsize=(15, 10),alpha=0.2)
plt.show()

In [None]:
# Correlation matrix of every numeric column except Survived.
# Sex, Cabin and Embarked hold strings at this point, and corr() raises on non-numeric
# columns in pandas >= 2.0, so only the numeric columns are passed on.
titanic.drop('Survived',axis=1).select_dtypes(include='number').corr()

In [None]:
# Print the column names, types and row count again for the frame in its current state.
titanic.info()

In [None]:
# Seaborn pair plot of the frame, with points coloured by Survived.
sns.pairplot(titanic,hue='Survived')

In [None]:
# Pair grid coloured by Survived; hue_kws supplies alpha=0.2 for each of the two hue levels.
g = sns.PairGrid(titanic,hue="Survived",hue_kws={"alpha": [0.2,0.2]})
# Filled density curves on the diagonal. `fill` is the current name of this option;
# the older `shade` spelling is deprecated in seaborn.
g = g.map_diag(sns.kdeplot,fill=True)
# Plain matplotlib scatter plots below the diagonal.
g = g.map_lower(plt.scatter)

g = g.add_legend()

In [None]:
# Display the Cabin column.
titanic.Cabin

In [None]:
 titanic.boxplot(column=['Age'],by=['Survived']) 
 plt.title( 'Boxplot of Age by Survived' )
 plt.suptitle('')
 plt.show()
 
 titanic.boxplot(column=['Parch'],by=['Survived']) 
 plt.title( 'Boxplot of Parch by Survived' )
 plt.suptitle('')
 plt.show()

titanic.boxplot(column=['SibSp'],by=['Survived']) 
 plt.title( 'Boxplot of SibSp by Survived' )
 plt.suptitle('')
 plt.show()




In [None]:
# Per-Survived-group statistics of SibSp: mean, median, min, max and,
# through the lambda, the 75th percentile.
titanic.groupby('Survived')[['SibSp']].agg(['mean','median','min','max',lambda x: np.percentile(x, q = 75)])

In [None]:
# Both imports are kept at cell level: other cells in this notebook may rely on these names.
from mpl_toolkits.mplot3d import Axes3D
from matplotlib.colors import ListedColormap

# 3-D axes created through the figure, so they are attached to it.
# Axes3D(fig) no longer adds itself to the figure in current Matplotlib, which left the
# plot empty and sent the legend to a fresh 2-D axes.
fig = plt.figure(figsize=(6,6))
ax = fig.add_subplot(projection='3d')

# get colormap from seaborn
# NOTE(review): cmap is built but never passed to scatter(), so the points use the default
# colormap — confirm whether it was meant to be used.
cmap = ListedColormap(sns.color_palette("husl", 256).as_hex())

# plot: Parch, Age and Fare on the three axes, points coloured by Survived
sc = ax.scatter(titanic.Parch, titanic.Age, titanic.Fare, s=40, c=(titanic.Survived), marker='o', alpha=.5)
ax.set_xlabel('Parch')
ax.set_ylabel('Age')
ax.set_zlabel('Fare')

# legend: one entry per distinct colour value, placed outside the axes
plt.legend(*sc.legend_elements(), bbox_to_anchor=(1.05, 1), loc=2)

In [None]:
# Work on a copy so that `titanic` itself is left untouched.
titanic_categorical = titanic.copy()

# Numeric indicator derived from Sex: 1 for 'female', 0 for 'male'
# (any other value is left as NaN).
titanic_categorical['Female'] = titanic_categorical['Sex'].map({'female': 1.0, 'male': 0.0})


In [None]:
# Display the copy, which now carries the extra Female column.
titanic_categorical

In [None]:
# 3-D axes created through the figure, so they are attached to it.
# Axes3D(fig) no longer adds itself to the figure in current Matplotlib, and it also made
# this cell depend on an import performed in an earlier cell.
fig = plt.figure(figsize=(6,6))
ax = fig.add_subplot(projection='3d')

# get colormap from seaborn
# NOTE(review): cmap is built but never passed to scatter(), so the points use the default
# colormap — confirm whether it was meant to be used.
from matplotlib.colors import ListedColormap
cmap = ListedColormap(sns.color_palette("husl", 256).as_hex())

# plot: Fare, Pclass and the Female indicator on the three axes, points coloured by Survived
sc = ax.scatter(titanic_categorical.Fare, titanic_categorical.Pclass, titanic_categorical.Female, s=40, c=(titanic_categorical.Survived), marker='o', alpha=.5)
ax.set_xlabel('Fare')
ax.set_ylabel('Pclass')
ax.set_zlabel('Female')

# legend: one entry per distinct colour value, placed outside the axes
plt.legend(*sc.legend_elements(), bbox_to_anchor=(1.05, 1), loc=2)