In [1]:
%matplotlib inline
%load_ext autoreload

import warnings
warnings.filterwarnings("ignore") # disable warnings

from os import getcwd
from os.path import join, abspath, pardir, exists
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.figure_factory as ff
import plotly.offline as py
py.init_notebook_mode(connected=True)
import plotly.graph_objs as go
import plotly.tools as tls

# scipy
from scipy.stats import ttest_ind, chi2_contingency, boxcox, skew
from scipy.stats.stats import pearsonr

# sklearn libraries
from sklearn import preprocessing
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier, LocalOutlierFactor
from sklearn.impute import KNNImputer, SimpleImputer
from sklearn.cluster import DBSCAN
from sklearn.feature_selection import SelectKBest, f_classif, chi2

# IPython
from IPython.core.interactiveshell import InteractiveShell

##### Config settings

In [None]:
# Paths: the project root is two levels above the current working directory,
# and the dataset lives in its "data" folder.
parent_dir = abspath(join(getcwd(), pardir, pardir))
data_dir = join(parent_dir, "data")
data_file = join(data_dir, "speed_dating.csv")

# IPython: display the value of every expression statement in a cell,
# not just the last one.
InteractiveShell.ast_node_interactivity = "all"

# pandas: show up to 200 columns / 200 rows when rendering a frame
# (instead of the defaults of 20 and 60).
pd.set_option("display.max_columns", 200)
pd.set_option("display.max_rows", 200)

In [None]:
# Load the raw speed-dating CSV. The file is decoded as ISO-8859-1
# (Latin-1) rather than pandas' default UTF-8.
df = pd.read_csv(
    data_file,
    encoding='ISO-8859-1',
)
# Preview the first rows.
df.head()

##### Basic checks

In [None]:
# Size of the raw frame: number of rows, then number of columns.
df.shape[0]
df.shape[1]

##### Missing data

In [None]:
# Percentage of missing values per column, shown as a one-column frame
# sorted from the most to the least incomplete column.
missing_pct = (df.isna().sum() / len(df)) * 100
missing_pct.to_frame(name='missing %').sort_values(by=['missing %'], ascending=False)

##### Profile Report

In [None]:
# Build a profiling report of the raw frame in minimal mode and embed it
# in the notebook.
# NOTE(review): `ProfileReport` is not imported in any cell visible here;
# presumably it comes from pandas_profiling / ydata_profiling. Confirm and
# add the import to the import cell, otherwise this cell raises NameError
# on a fresh kernel.
profile = ProfileReport(
    df,
    title="Speed Dating Report",
    minimal=True,
    vars={"num": {"low_categorical_threshold": 0}},
)
profile.to_notebook_iframe()

### Gender based analysis

Let's look at the ratio of men to women among the participants.

In [None]:
# Translate the numeric gender codes into readable labels before plotting.
# The session-wise cells further down treat gender == 0 as women and
# gender == 1 as men, which matches the published data key
# (Female = 0, Male = 1), so the labels here follow that encoding.
# A single dict-based replace maps both codes in one pass and leaves any
# other value untouched.
gender_labels = {0: "Women", 1: "Men"}
gender_df = df['gender'].replace(gender_labels).to_frame(name='gender')

# One bar per gender label, height = number of rows with that label.
sns.countplot(x='gender', data=gender_df)

##### Findings
- Men and women are represented in equal proportion.

<br>

In [None]:
pip install -U radian

In [None]:
# All column names of the raw frame as a plain Python list.
df.columns.tolist()

#### Session-wise exploration

In [None]:
# Spot check: shape of the sub-frame belonging to the third distinct iid.
# A descriptive name is used instead of `id` so the builtin is not shadowed
# and no loop variable is left behind as hidden notebook state.
third_iid = df['iid'].unique()[2]
df[df['iid'] == third_iid].shape

# Rows per participant, printed as "<iid> (<rows>, <columns>)".
# One group-by replaces re-filtering the whole frame once per participant.
# sort=False keeps the participants in order of first appearance, which is
# the order `unique()` yields. Rows with a missing iid are not listed
# (group-by drops NaN keys by default).
n_cols = df.shape[1]
for participant_iid, n_rows in df.groupby('iid', sort=False).size().items():
    # int() keeps the tuple rendering as plain Python ints, e.g. "(10, 195)".
    print("{} {}".format(participant_iid, (int(n_rows), n_cols)))

In [None]:
# For every distinct `position` value, count the distinct participants (iid)
# of each gender. This cell treats gender == 0 as women and gender == 1
# as men.
is_woman = df['gender'] == 0
is_man = df['gender'] == 1

for pos in df['position'].unique():
    at_position = df['position'] == pos
    women = len(df.loc[at_position & is_woman, 'iid'].unique())
    men = len(df.loc[at_position & is_man, 'iid'].unique())
    print("Position: {}\tMen: {}\tWomen: {}".format(pos, men, women))

In [None]:
# All rows of participant 551 in sorted order. The original cell referenced
# `.sort_values` without calling it, which only displays the bound-method
# repr instead of sorted rows.
# NOTE(review): `position` is used as the sort key because it is the only
# other per-row column referenced in the visible cells; swap in the intended
# column if a different ordering was meant.
df[df['iid'] == 551].sort_values(by='position')

In [None]:
# Pick the first distinct participant id for inspection.
# NOTE: `id` shadows the Python builtin; the name is kept because a later
# cell reads it.
distinct_iids = df['iid'].unique()
id = distinct_iids[0]

In [None]:
# Distinct participant ids, in order of first appearance.
pd.unique(df['iid'])

In [None]:
# All rows belonging to the participant selected in `id`.
df.loc[df['iid'].eq(id)]