# Data Analysis

In this step we play with the data. 

We check the data behavior:
1. Features analysis - distribution, outliers analysis and domain knowledge analysis.
2. Time series analysis - stationarity, autocorrelation, trends, etc.
3. Target analysis:
    - Distribution
    - Target relationships
    - Features interactions with target
    - Target time series distribution
4. Correlations analysis
    - Pearson heatmap
    - Feature vs Feature relationships 
    - PCA
5. Features Importance
6. Regression Analysis


Next step is Data Processing Phase 2.

# Import 

In [None]:
# math operations
import pandas as pd
import numpy as np 
import scipy

import xgboost
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
import statsmodels.api as sm

import matplotlib.pyplot as plt
import matplotlib
import matplotlib.gridspec as gridspec
import seaborn as sns

# plotly
import plotly
from plotly import tools
import plotly.graph_objs as go

# Init

In [None]:
# Notebook-wide display configuration (plot theme, pandas display, warnings).

# matplotlib base font size
# NOTE(review): sns.set() below re-applies seaborn's own rcParams (including
# font sizes), so this value is most likely overridden — confirm which one
# is intended.
matplotlib.rcParams.update({'font.size': 22})

# seaborn theme: each sns.set() call resets every parameter it is not given
# back to its default, so style, color codes and font scale have to be set
# in ONE call; otherwise the "dark" style is silently replaced by the default.
sns.set(style="dark", color_codes=True, font_scale=1.2)

# plotly offline
plotly.offline.init_notebook_mode(connected=True)

# pandas columns setting
pd.set_option('display.max_columns', 30)

# disable warning
import warnings
warnings.filterwarnings('ignore')

# notebook width (IPython.display is the public import path;
# IPython.core.display is deprecated for display/HTML)
from IPython.display import display, HTML
display(HTML("<style>.container { width:80% !important; }</style>"))

# Load Data

In [None]:
# Location of the input dataset (placeholder value — replace with the real path).
data_path = 'path'

# read data
# NOTE(review): every later cell uses a DataFrame named `df`, but nothing in this
# notebook creates it. TODO: load `data_path` into `df` here so the notebook
# survives Restart & Run All.

# Features Analysis

## Analysis - Continuous

In [None]:
# dtypes that count as numeric for this analysis
numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
numerics_columns_types = df.select_dtypes(include=numerics).columns

# name fragments that remove a column from the continuous analysis
exclude_cols = ['col1', 'col2']


def exclude_fun(y):
    """Return True when any fragment in ``exclude_cols`` occurs inside the name ``y``."""
    for x in exclude_cols:
        if x in y:
            return True
    return False


# keep every numeric column that no exclusion fragment matches
numeric_cols = []
for col in numerics_columns_types:
    if exclude_fun(col):
        continue
    numeric_cols.append(col)
print(numeric_cols)

### Analysis

__col11:__

In [None]:
# summary statistics of col11 as a one-column table
df.col11.describe().to_frame()

In [None]:
# Histogram and boxplot of col11, side by side.
# The surrounding section (header and describe cell) is about col11; the plots
# previously used col1, so the figure did not match the statistics above it.
fig, axes = plt.subplots(1, 2, figsize=(20, 8))

# both subplots title
fig.suptitle('All population')

# plot1 object: distribution of the feature
sns.distplot(df.col11, ax=axes[0])

# plot1 labels config
axes[0].set_title('col11 histogram')

# plot2 object: boxplot to expose outliers
sns.boxplot(y=df.col11, ax=axes[1])

# plot2 labels config
axes[1].set_title('col11 boxplot')

# show plot
plt.show()

### Conclusions

* 
* 

__Next steps:__

* 
* 

## Analysis - Booleans

In [None]:
# dtypes that count as numeric for this analysis
numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
numerics_columns_types = df.select_dtypes(include=numerics).columns


def exclude_fun(y):
    """Return True when the name ``y`` contains one of the boolean-flag fragments."""
    for x in ['is_', 'has_', 'have_']:
        if x in y:
            return True
    return False


# keep only the numeric columns whose name looks like a boolean flag
boolean_cols = list(filter(exclude_fun, numerics_columns_types))

print(boolean_cols)

### Analysis

In [None]:
# share of True values per boolean column (column mean), largest first
booleans_df = (
    df[boolean_cols]
    .mean()
    .to_frame('true_precentage')
    .sort_values('true_precentage', ascending=False)
)

In [None]:
# Bar chart of the share of True values for each boolean feature.
plot = sns.barplot(x=booleans_df.index, y=booleans_df.true_precentage)

# figure size config
# The width used to be set twice (10, then 20), so only 20 ever took effect;
# the dead first call is removed.
# NOTE(review): the first call was probably meant to be set_figheight —
# confirm the desired height and add it here.
plot.figure.set_figwidth(20)

# rotate x labels to fit plot
plot.set_xticklabels(booleans_df.index, rotation=-45)

# titles
plot.set_title('Boolean features Distributions')
plot.set_xlabel('Feature')
plot.set_ylabel('Is True precentage')

plt.show()

### Conclusions

* 
* 

__Next steps:__

* 
* 

## Analysis - Discrete

In [None]:
# Columns analysed as discrete features (placeholder names — replace with real columns).
discrete_cols = ['col1', 'col2']
print(discrete_cols)

### Analysis

__col1:__

High cardinality

In [None]:
# summary statistics of col1 as a one-column table
df.col1.describe().to_frame()

In [None]:
# Distribution of the high-cardinality discrete feature col1, plus a zoom on its right tail.

# Cut-off above which a value belongs to the right tail.
# `constant` was referenced without ever being defined (NameError on a fresh
# kernel); the 95th percentile is used as a runnable default.
# TODO: replace with the domain-specific threshold.
constant = df.col1.quantile(0.95)

fig, axes = plt.subplots(1, 2, figsize=(20, 4))

# plot1 object: full distribution
sns.distplot(df.col1, ax=axes[0])

# plot1 labels config
axes[0].set_title('col1 Histogram')

# plot2 object: counts of the values above the cut-off.
# The data is passed as `x=` because recent seaborn releases no longer accept
# it positionally.
sns.countplot(x=df.col1[df.col1 > constant], ax=axes[1])

# plot2 labels config
axes[1].set_title('Right tail Histogram')

# show plot
plt.show()

__col2:__

Low cardinality

In [None]:
# summary statistics of col2 as a one-column table
df.col2.describe().to_frame()

In [None]:
# Value counts of the low-cardinality discrete feature col2.
# The figure is created at its final size (14 x 6) instead of being created at
# (20, 8) and resized straight away.
fig, axes = plt.subplots(1, 1, figsize=(14, 6))

# plot1 object: data is passed as `x=` because recent seaborn releases no
# longer accept it positionally.
sns.countplot(x=df.col2, ax=axes)

# show plot
plt.show()

### Conclusions

* 
* 

__Next steps:__

* 
* 

## Analysis - Categorical data

### Analysis

In [None]:
# Names of the object-dtype (categorical / text) columns.
# select_dtypes picks the columns directly; describe(include='object') computed
# summary statistics only to throw them away and raised when the frame had no
# object columns at all.
cat_cols = df.select_dtypes(include='object').columns
print(cat_cols.tolist())

__col1:__

### Conclusions

* 
* 

__Next steps:__

* 
* 

## Analysis - Dates

### Analysis

__Years:__

In [None]:
# calendar year of every row's date_col value
years = df['date_col'].dt.year.to_list()

In [None]:
# Number of rows per calendar year.
# The figure is created at its final size (6 x 4) instead of being created at
# (20, 8) and resized straight away.
fig, axes = plt.subplots(1, 1, figsize=(6, 4))

# plot1 object: data is passed as `x=` because recent seaborn releases no
# longer accept it positionally.
sns.countplot(x=years, ax=axes)

# show plot
plt.show()

__Months:__

In [None]:
# calendar month of every row's date_col value
months = df['date_col'].dt.month.to_list()

In [None]:
# Number of rows per calendar month.
# The figure is created at its final size (14 x 4) instead of being created at
# (20, 8) and resized straight away.
fig, axes = plt.subplots(1, 1, figsize=(14, 4))

# plot1 object: data is passed as `x=` because recent seaborn releases no
# longer accept it positionally.
sns.countplot(x=months, ax=axes)

# show plot
plt.show()

__Days of month:__

In [None]:
# day of month of every row's date_col value
days_of_month = df['date_col'].dt.day.to_list()

In [None]:
# Number of rows per day of month.
# The figure is created at its final size (14 x 4) instead of being created at
# (20, 8) and resized straight away.
fig, axes = plt.subplots(1, 1, figsize=(14, 4))

# plot1 object: data is passed as `x=` because recent seaborn releases no
# longer accept it positionally.
sns.countplot(x=days_of_month, ax=axes)

# show plot
plt.show()

### Conclusions

* 
* 

__Next steps:__

* 
* 

# Time series Analysis

In [None]:
# copy of the data with an extra `date` column holding the calendar date of date_col
df_ts = df.assign(date=df.date_col.dt.date)

In [None]:
# Daily entity count over time.

# `date_jobs_count_ts` was plotted without ever being built (NameError on a
# fresh kernel). It is derived here from df_ts: one row per date with the
# number of non-null `entity` values.
# NOTE(review): assumes df_ts has an `entity` column, as the plot's y="entity"
# implies — confirm the column name.
date_jobs_count_ts = df_ts.groupby('date', as_index=False)['entity'].count()

fig, axes = plt.subplots(1, 1, figsize=(20, 4))

# plot1 object
sns.lineplot(x="date", y="entity", data=date_jobs_count_ts, ax=axes)

# titles
axes.set_title('entity count Time Series')
axes.set_xlabel('Date')
axes.set_ylabel('entity Count')

# show plot
plt.show()

## Conclusions

* 
* 

__Next steps:__

* 
* 

# Target Analysis

## Target Distribution

In [None]:
# summary statistics of the target as a one-column table
df.target.describe().to_frame()

In [None]:
# target distribution: histogram (left) next to a boxplot (right)
fig, axes = plt.subplots(1, 2, figsize=(20, 5))
hist_ax, box_ax = axes

# shared title for both panels
fig.suptitle('All population')

# left panel
sns.distplot(df.target, ax=hist_ax)
hist_ax.set_title('target histogram')

# right panel
sns.boxplot(y=df.target, ax=box_ax)
box_ax.set_title('target boxplot')

# render the figure
plt.show()

In [None]:
# Histogram of the target's right tail (values at or above a cut-off).

# `const` was referenced without ever being defined (NameError on a fresh
# kernel); the 95th percentile is used as a runnable default.
# TODO: replace with the domain-specific threshold.
const = df.target.quantile(0.95)

fig, axes = plt.subplots(1, 1, figsize=(20, 4))

# plot1 object: raw counts only (no density estimate)
sns.distplot(df.target[df.target >= const], kde=False, ax=axes)

# plot1 labels config: show the actual cut-off, and ">=" to match the filter
axes.set_title(f'target histogram >= {const:g}')

# show plot
plt.show()

__Outliers analysis__

## Target Relationships

In [None]:
# Top 10 features by absolute Pearson correlation with the target.

# Restrict to numeric/boolean columns explicitly: DataFrame.corr() raises on
# non-numeric columns in recent pandas, and older versions dropped them silently.
numeric_df = df.select_dtypes(include=['number', 'bool'])

# The correlation matrix is computed once (it used to be computed twice).
data = (
    numeric_df.corr()[['target']]
    .drop(index='target')          # the target's correlation with itself is not informative
    .abs()
    .sort_values('target', ascending=False)
    .head(10)
)

fig, axes = plt.subplots(1, 1, figsize=(20, 4))

# plot1 object
sns.barplot(x=data.index, y=data.target, ax=axes)

# rotate x labels to fit plot
axes.set_xticklabels(data.index, rotation=-45)

# titles (the plotted values are absolute correlations)
axes.set_title('Top 10 correlations with target')
axes.set_xlabel('Features')
axes.set_ylabel('Absolute Pearson correlation')

# show plot
plt.show()

__target VS. col1__

In [None]:
# Joint distribution of col1 and the target with a fitted regression line.
# y was "total_clicks", a column name used nowhere else in the target analysis;
# this section is "target VS. col1", so the target column is plotted.
plot = sns.jointplot(x="col1", y="target", data=df, kind="reg")

# show plot
plt.show()

__target VS. col2__

Ordinal Discrete values

In [None]:
# point estimate of the target for each col2 level
plot = sns.catplot(
    data=df,
    x="col2",
    y="target",
    kind="point",
    height=5,
    aspect=2,
)

__target Vs. col3__

High cardinality (target statistics of each category)

In [None]:
# Mean target for the 20 col3 categories with the highest mean target.
col3_means = (
    df.groupby(['col3'], as_index=False)
    .agg({'target': 'mean'})
    .sort_values('target', ascending=False)
)

# categories actually drawn, best first
top_col3 = col3_means.col3[:20]

# plot1 object
plot = sns.catplot(x="col3", y="target", kind="bar", order=top_col3, height=5, aspect=4, data=df)

# Rotate the existing tick labels. The labels of ALL categories used to be
# passed here although only 20 bars are drawn, which mismatches the ticks
# whenever col3 has more than 20 categories.
plot.set_xticklabels(rotation=-45)

# show plot
plt.show()

## Features interactions with target

In [None]:
# Regression of the target on col1, coloured by interaction_col and faceted by
# col2 (columns) and col3 (rows).
plot = sns.lmplot(x="col1", y="target",
                  hue="interaction_col",
                  col='col2',
                  row='col3',
                  data=df, height=5, aspect=2)

# Optional axis limits as (min, max) tuples; None keeps the automatic range.
# The former `plot.set(xlim=(,), ylim=(,))` was a SyntaxError that stopped the
# cell from running at all.
x_limits = None  # TODO: e.g. (0, 100)
y_limits = None  # TODO: e.g. (0, 100)

axis_limits = {}
if x_limits is not None:
    axis_limits['xlim'] = x_limits
if y_limits is not None:
    axis_limits['ylim'] = y_limits
if axis_limits:
    plot.set(**axis_limits)

## Target time series

In [None]:
# Time-series DataFrame for the target: starts as a full copy of df.
# NOTE(review): this cell is a stub — no aggregation or plot follows. TODO: add
# the target-over-time analysis.
target_ts = df.copy()


## Conclusions

* 
* 

__Next steps:__

* 
* 

# Correlation analysis

## High level analysis 

__Correlation Heatmap:__

In [None]:
# Columns to include in the correlation heatmap (placeholder).
# TODO: fill in the column names before running the heatmap cell below.
heatmap_cols = []

In [None]:
# pairwise correlation heatmap of the selected columns
heatmap_fig, heatmap_ax = plt.subplots(figsize=(20, 12))
corr_matrix = df[heatmap_cols].corr()
sns.heatmap(corr_matrix, ax=heatmap_ax)
plt.show()

In [None]:
# Ten most strongly correlated column pairs (absolute Pearson correlation).

# `target` may already be part of numeric_cols; de-duplicate (keeping order)
# so the column is not selected twice.
corr_cols = list(dict.fromkeys(numeric_cols + ['target']))
cor = df[corr_cols].corr().abs()

# Keep only the strict upper triangle: this drops the diagonal and lists each
# pair once. The previous `value < 1` filter listed every pair twice ((a, b)
# and (b, a)), hid genuinely perfect correlations, and could let a
# 0.9999... self-correlation through.
upper_triangle = np.triu(np.ones(cor.shape, dtype=bool), k=1)

s = pd.DataFrame(
    cor.where(upper_triangle)
    .stack()
    .dropna()  # masked cells are NaN; drop them explicitly, whatever stack() does with them
    .sort_values(kind="quicksort", ascending=False)
)

s.head(10)


## Feature vs Feature analysis

In [None]:
# joint distribution of col1 and col2 with a fitted regression line
plot = sns.jointplot(
    data=df,
    x="col1",
    y="col2",
    kind="reg",
)

# render the figure
plt.show()

In [None]:
# scatter plot of every pair of continuous features
pairwise_data = df[numeric_cols]
g = sns.PairGrid(pairwise_data, palette="GnBu_d")
scatter_style = dict(s=50, edgecolor="white")
g.map(plt.scatter, **scatter_style)

## Conclusions

* 
* 

__Next steps:__

* 
* 

## PCA analysis

In [None]:
# columns left out of the PCA input (placeholder names)
cols_to_drop = ['col1', 'col2']

# remaining columns, complete rows only
X_pca = df.drop(columns=cols_to_drop, errors='ignore').dropna()

# z-score every remaining column
z_scaler = StandardScaler()
z_data = z_scaler.fit(X_pca).transform(X_pca)

In [None]:
# fit a PCA that keeps one component per input column
n_input_columns = z_data.shape[1]
pca = PCA(n_components=n_input_columns).fit(z_data)

# share of the total variance captured by each component
pca.explained_variance_ratio_

In [None]:
# Cumulative percentage of variance explained as components are added.

# per-component variance ratios
variance = pca.explained_variance_ratio_

# Cumulative percentage. The ratios used to be rounded to 3 decimals BEFORE
# being summed, which accumulates rounding error across components.
var = np.cumsum(variance) * 100

# 1-based component counts, so the x axis reads "first k components"
# (plt.plot(var) alone starts the axis at 0).
n_components = np.arange(1, len(var) + 1)

plt.plot(n_components, var)

plt.ylabel('% Variance Explained')
plt.xlabel('# of Features')
plt.title('PCA Analysis')

# The axis is in percent; the old lower limit was the smallest ratio as a
# fraction in [0, 1], i.e. effectively 0 on this scale.
plt.ylim(0, 100.5)

# The former `plt.style.context('seaborn-whitegrid')` call was dropped: the
# context manager was created but never entered, so it had no effect.

plt.show()

## Conclusions

* 
* 

__Next steps:__

* 
* 

# Features importance

In [None]:
# columns left out of the feature-importance model (placeholder names)
cols_to_drop = ['col1', 'col2']

# remaining columns, complete rows only
X_importance = df.drop(columns=cols_to_drop, errors='ignore').dropna()

In [None]:
# Fit a gradient-boosted regressor on every column except the target and read
# its feature importances.

# The notebook imports the package as `import xgboost`; the `xgb` alias used
# here before was never defined (NameError on a fresh kernel).
# `bootstrap=True` was removed: it is a scikit-learn random-forest option, not
# an XGBRegressor parameter.
forest = xgboost.XGBRegressor(n_estimators=100,
                              random_state=7,
                              n_jobs=-1,
                              max_depth=10)


forest.fit(X_importance.loc[:, ~X_importance.columns.isin(['target'])], X_importance['target'])

# one importance score per fitted feature, in training-column order
importances = forest.feature_importances_

In [None]:
# Table of features and their importance scores, most important first.
# The feature names must be exactly the columns the model was fitted on:
# X_importance without 'target'. The previous version indexed an undefined `X`
# and excluded 'total_clicks' instead of 'target', so names and scores could
# not line up.
importance_df = pd.DataFrame({
    'features': X_importance.columns[~X_importance.columns.isin(['target'])],
    'importances': importances,
})
importance_df.sort_values('importances', ascending=False).reset_index(drop=True)

## Conclusions

* 
* 

__Next steps:__

* 
* 

# Regression Analysis

In [None]:
# OLS regression of the target on the standardised features.

# `X` was never defined in this notebook. The frame prepared for the
# feature-importance model (X_importance: selected columns, complete rows) is
# used instead.
# NOTE(review): confirm this is the intended input.
# Rows with missing values are dropped BEFORE splitting, so y and the feature
# matrix always have the same length.
reg_data = X_importance.dropna()

y = reg_data['target'].copy()  ## Y usually means our output/dependent variable

X_reg = reg_data.loc[:, ~reg_data.columns.isin(['target'])].copy()

# standardize data
z_scaler = StandardScaler()
X_stand = z_scaler.fit_transform(X_reg)
# Label with the feature columns only; the previous version used the full
# column list (target included), one more name than there are columns.
X_stand = pd.DataFrame(X_stand, columns=X_reg.columns.tolist())
X_stand = sm.add_constant(X_stand)

# Note the difference in argument order
model = sm.OLS(y.tolist(), X_stand).fit()  ## sm.OLS(output, input)
predictions = model.predict(X_stand)

# Print out the statistics
model.summary()

In [None]:
# p-value of every regression term, as a one-column table
reg_pvalues = pd.DataFrame(model.pvalues).set_axis(['pvalues'], axis=1).abs()

# terms that are not significant at the 5% level, least significant first
not_significant = reg_pvalues['pvalues'] > 0.05
reg_pvalues.loc[not_significant].sort_values('pvalues', ascending=False)


In [None]:
# absolute coefficient of every regression term, largest first
reg_coef = pd.DataFrame(model.params).set_axis(['coef'], axis=1).abs()
reg_coef.sort_values('coef', ascending=False)

## Conclusions

* 
* 

__Next steps:__

* 
* 

# Summary



## Features Analysis

### Continuous Data:

* 
* 

__Next steps:__

* 
* 

### Boolean Data:

* 
* 

__Next steps:__

* 
* 

### Discrete Data:

* 
* 

__Next steps:__

* 
* 

### Categorical Data:

* 
* 

__Next steps:__

* 
* 

### Dates Data:

* 
* 

__Next steps:__

* 
* 

## Time Series Analysis:

* 
* 

__Next steps:__

* 
* 

## Target Analysis:

### Target distribution:

* 
* 

__Next steps:__

* 
* 

### Target relationships

* 
* 

__Next steps:__

* 
* 

### Features interaction with target

* 
* 

__Next steps:__

* 
* 

### Target time series

* 
* 

__Next steps:__

* 
* 

## Correlations:

### High level

* 
* 

__Next steps:__

* 
* 

### Feature vs feature

* 
* 

__Next steps:__

* 
* 

### PCA

* 
* 

__Next steps:__

* 
* 

## Features Importances

* 
* 

__Next steps:__

* 
* 

## Regression Analysis:

* 
* 

__Next steps:__

* 
* 
