<a href="https://colab.research.google.com/github/namhwui/LearnPython/blob/main/useful_tricks.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Libraries

In [None]:
import numpy as np # numpy; math operations
import scipy as sp # scipy; has statistical functions and tests, including hierarchical clustering
import pandas as pd # pandas
import statsmodels as sm # statistical models with emphasis on parameter estimation and inference
import sklearn.model_selection # has train_test_split, GridSearchCV
import sklearn.linear_model # has regression models
from sklearn.preprocessing import StandardScaler # scaling
import auto_ts as at # automatic time series modelling; do 'pip install auto-ts' first 
import matplotlib.pyplot as plt # basic plotting library
import seaborn as sns # fancy wrapper for vis
import plotnine as p9 # ggplot in python

## Pandas, Numpy Tricks

In [None]:
# search a dict for a key (membership test; evaluates to True/False)
'key' in dct

# pd.read_csv(..., index_col = [0]) uses the 1st column as the row index (primary key)

# get value associated with key (raises KeyError if the key is absent)
dct['key']

# get values from dictionary
dct.values()


# drop columns (inplace = True mutates `data` and returns None)
data.drop('col', axis = 1, inplace = True)

# drop rows containing missing values
# NOTE: without inplace = True the result is returned, not stored; assign it to keep it
data.dropna(axis=0)

# fill na with 0 (same note: returns a new object, `data` itself is unchanged)
data.fillna(0)

# calculate multiple aggregate functions per group
# keys are the columns to aggregate; each value is a callable, a function
# name, or a list of them (a list yields one output column per function)
df.groupby(['variable']).agg({
    'agg1': lambda x: x.max() - x.min(),  # example custom function; swap in your own
    'agg2': 'count',
    'agg3': 'sum',
    'agg4': ['mean', 'count']})

# rename several columns at once; inplace=True mutates the data frame
aggregate_names = {'agg1': 'col1',
                   'agg2': 'col2',
                   'agg3': 'col3'}
df.rename(columns=aggregate_names, inplace=True)

# number of rows in each (a, b) group
df.groupby(['a', 'b']).size()

# boolean mask: True for rows with at least one missing value
df.isna().any(axis=1)

# rename columns (old name -> new name)
old_to_new = {'oldname1': 'newname1', 'oldname2': 'newname2'}
df.rename(columns=old_to_new, inplace=True)

# inspect the data type of every column
df.dtypes
# convert a column to another data type
df['column'] = df['column'].astype('category')


# count the unique elements in a series and their occurrence
df['a'].value_counts()
# same for any array-like: returns (unique values, counts)
# `array` is a placeholder variable, not the string 'array'
np.unique(array, return_counts=True)

# wide -> long: turns an (obs x column) data frame into (obs * column) rows,
# with one column holding the variable name and another holding its value
pd.melt(df, id_vars='primary_key_here', var_name='column', value_name='value')

# simplify/recode labels (labels missing from the mapping become NaN)
df['label2'] = df['label'].map({'label1': 'else1', 'label2': 'else1', 'label3': 'else2'})

# slicing a data frame with [] selects rows by position
df[0:3]  # first three rows (positions 0, 1, 2); the end position is exclusive

# flip boolean element-wise when locating inside data frame with ~
df[~(df.column == 'foo')]

# when combining multiple boolean arrays for locating, bracket each condition
# and bind with & (and) or | (or), e.g. df[(df.a > 0) & (df.b == 'foo')]

# find minimum element in a list/array and its position
min(array)
np.argmin(array)

# drop last two characters in selected strings
# 11th, 10th, 3rd, 8th, 7th: drop the ordinal suffix
num_th = data.Processor_Gen.isin(['11th', '10th', '3rd', '8th', '7th'])
# use a single .loc assignment; chained indexing (data.col[mask] = ...) may
# write to a temporary copy and leave `data` unchanged
data.loc[num_th, 'Processor_Gen'] = data.loc[num_th, 'Processor_Gen'].str[:-2]

# capitalise all
data['Model'] = data['Model'].str.upper()

# turn date string to datetime
data['date'] = pd.to_datetime(data['date'])  # full timestamp (includes time of day)
# keep the calendar date only; for other granularities use dt.month, dt.year
# or dt.isocalendar().week (dt.week was removed in pandas 2.0)
data['date'] = data['date'].dt.date
# get recency from date: reference date one day after the latest record
# (presumably recency is then measured back from this date; confirm with usage)
import datetime
recency = df.date.max() + datetime.timedelta(days=1)

## Visualisation Tricks


### Matplotlib and Seaborn

In [None]:
# plot columns over a grid
plt.figure(figsize=(10, 10), dpi=80)
# all columns except the last; subplot positions are 1-based, hence start=1
for i, column in enumerate(data.columns[:-1], 1):
    plt.subplot(4, 3, i)  # 4 = number of rows, 3 = number of columns (holds up to 12 plots)
    sns.histplot(x=data[column], hue=data.another_column)
plt.tight_layout()
plt.show()  # must be called; a bare `plt.show` only references the function

# another way to plot over a grid: create the axes first, then draw into each
fig, ax = plt.subplots(1, 4, figsize=(13, 5))
for ii, demographic in enumerate(Demographics):
    # one bar chart per demographic column, each on its own axis
    sns.countplot(x=demographic, ax=ax[ii], data=data)
plt.tight_layout()
plt.show()

# yet another example of plotting over a grid:
# all rows, then only the 'Yes' rows, then only the 'No' rows of col3
fig, ax = plt.subplots(1, 3, figsize=(13, 5))
yes_rows = data.loc[data.col3 == 'Yes', :]
no_rows = data.loc[data.col3 == 'No', :]
sns.scatterplot(data=data, x='col1', y='col2', hue='col3', alpha=0.25, ax=ax[0])
sns.scatterplot(data=yes_rows, x='col1', y='col2', hue='col3', ax=ax[1])
sns.scatterplot(data=no_rows, x='col1', y='col2', hue='col3', ax=ax[2])
plt.tight_layout()
plt.show()

# facet grid in seaborn: one histogram per original column
# melt to long format first so every column becomes a facet
long_data = data.melt(var_name='column')
# sharex=False gives each plot its own x scale; col_wrap=4 starts a new row after 4 facets
g = sns.FacetGrid(long_data, col='column', col_wrap=4, sharex=False)
g.map(sns.histplot, 'value')


# one-line catalogue of common plots
# (`column`, `ii`, `ax`, `Demographics` and `Numerical` are assumed to be
#  defined elsewhere in the notebook)

# histogram, coloured by a second column
sns.histplot(x=data[column], hue=data.another_column)
# density plot: histogram with a KDE curve overlaid (figure-level function)
sns.displot(x=data[column], kde=True)
# scatterplot
sns.scatterplot(x='column1', y='column2', hue='optional', data=data)
# bar plot of category counts
sns.countplot(x=Demographics[ii], ax=ax[ii], data=data)
# box plot
sns.boxplot(x=data.col1, y=data[column], hue=data.col1)
# mosaic plot of two categorical columns
from statsmodels.graphics.mosaicplot import mosaic
plt.figure(figsize=(30, 30), dpi=80)
mosaic(data, [Demographics[ii] for ii in [0, 1]], title=Demographics[0] + ' vs ' + Demographics[1])
# pair plot
sns.pairplot(data[Numerical])
# line graph on multiple numerical columns
# select with a LIST of column names so the result is still a DataFrame;
# a single string key returns a Series, which has no set_index
data.set_index('index column')[Numerical].plot()  # x axis = 'index column'
data[Numerical].plot()                            # x axis = existing row index


# line plot by cluster via melted data
# melt to long format: one row per (ID, Cluster, Attribute) with its Value
df_nor_melt = pd.melt(df_normalized.reset_index(),
                      id_vars=['ID', 'Cluster'],
                      value_vars=['Recency', 'Frequency', 'MonetaryValue'],
                      var_name='Attribute',
                      value_name='Value')
df_nor_melt.head()
# x and y must be passed by keyword; current seaborn rejects them positionally
sns.lineplot(x='Attribute', y='Value', hue='Cluster', data=df_nor_melt)


### Plotnine

In [None]:
# shared base plot; defined first because the examples below build on it
p9_base = p9.ggplot(data)

# Rating vs price, faceted by processor brand
(p9_base
 + p9.geom_point(p9.aes(x = 'Rating', y = 'Price'))
 + p9.labs(title = 'Rating vs Price',
           x = 'Rating',
           y = 'Price')
 + p9.facet_wrap('Processor_Brand'))

# another way to facet
# the formula notation dictates the direction in which facet goes
# ('rows ~ columns'; here one row per Processor_Brand)
(p9.ggplot(data, p9.aes(x = 'reorder(Brand, Rate_Cost, fun = np.median)', y = 'Rate_Cost'))
  + p9.geom_boxplot()
  + p9.facet_grid('Processor_Brand ~ .'))


# grouped boxplot (by Brand variable)
# reordered by median rating
(p9_base
 + p9.geom_boxplot(p9.aes(x = 'reorder(Brand, Rating, fun = np.median)', y = 'Rating'))
 + p9.labs(title = "Rating by Brand",
           x = 'Brand',
           y = 'Rating'))

## Scipy Tricks

In [None]:
# F test (one-way ANOVA) for equality of mean over multiple groups,
# run once per column (every column except the last)
# import the submodule explicitly: `sp.stats` only works if scipy.stats has
# already been loaded somewhere
from scipy import stats

for column in data.columns[:-1]:
    # NOTE(review): each group is selected through a different column
    # (col1 ... col6); if the groups are meant to be levels of ONE grouping
    # column, use that column in every condition -- confirm intent
    groups = [
        data.loc[data.col1 == 3, column],
        data.loc[data.col2 == 4, column],
        data.loc[data.col3 == 5, column],
        data.loc[data.col4 == 6, column],
        data.loc[data.col5 == 7, column],
        data.loc[data.col6 == 8, column],
    ]
    # f-string also handles non-string column labels
    print(f'F-test result for {column}')
    print(stats.f_oneway(*groups))

## Sklearn Tricks

### Standardising data

In [None]:
# standardise each column to mean zero and standard deviation 1
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
# learn the column means / standard deviations and apply them in one step;
# `scaler` keeps the fitted statistics for transforming new data later
data_scaled = scaler.fit_transform(X)


### Train-test Split

In [None]:
from sklearn.model_selection import train_test_split

# hold out 20% of the rows as a test set; random_state fixes the shuffle
# so the partition is reproducible
X_train, X_test, y_train, y_test = train_test_split(
    data_scaled, data.quality_binary, random_state=42, test_size=0.2)

### Model Selection with GridSearchCV

In [None]:
# GridSearchCV cross-validates every combination in a pre-set hyperparameter
# grid and keeps the best one; the fitted search object then behaves like a
# model (fit, predict, ...) using the best hyperparameters

from sklearn.model_selection import GridSearchCV
# recipe:
param_grid = {
    'param1': ...,
    'param2': ...,
}
model = model_object(random_state=111)
# fit() returns the search object itself, so construction and fitting chain
model_grid = GridSearchCV(model, param_grid=param_grid, cv=5).fit(X_train, y_train)
model_grid.predict(X_test)
model_grid.predict_proba(X_test)  # only for models that can output probabilities

### Useful Models

In [None]:
# Boosted classification trees and a hyperparameter grid to search over
from sklearn.ensemble import GradientBoostingClassifier
GB_param_grid = dict(
    learning_rate=[0.1, 0.15, 0.2, 0.25, 0.3],
    subsample=[0.7, 0.8, 0.9, 1.0],
    max_depth=[2, 3, 4, 5],
)
GB = GradientBoostingClassifier()

# Logistic regression; tune the inverse regularisation strength C
from sklearn.linear_model import LogisticRegression
logistic = LogisticRegression()
logistic_param_grid = {
    # grid keys must be quoted parameter names of the estimator;
    # LogisticRegression takes `C` (`Cs` belongs to LogisticRegressionCV)
    'C': np.linspace(0.01, 1)
}

# Kernel support vector classifier
from sklearn import svm
KSVM = svm.SVC()
# every grid value must be a list of candidate settings, not a bare string;
# the entries below are a working starting point -- adjust to the problem
KSVM_param_grid = {
    'kernel': ['linear', 'rbf', 'poly'],
    'C': [0.1, 1, 10],            # regularisation strength (all kernels)
    'gamma': ['scale', 'auto'],   # kernel coefficient (rbf / poly / sigmoid)
}

# Ordinary least squares regression
from sklearn.linear_model import LinearRegression
# not much to tune
linreg = LinearRegression()

# Elastic net regression: alpha = overall penalty strength,
# l1_ratio = mix between ridge (0) and lasso (1), here 0, 0.1, ..., 1
from sklearn.linear_model import ElasticNet
EN = ElasticNet(random_state=42)
EN_param_grid = dict(
    alpha=[0.1, 0.5, 1, 2],
    l1_ratio=np.linspace(0, 1, num=11),
)


# Boosted regression trees and a hyperparameter grid to search over
from sklearn.ensemble import GradientBoostingRegressor
GBreg = GradientBoostingRegressor(random_state=42)
GBreg_param_grid = dict(
    learning_rate=[0.1, 0.15, 0.2, 0.25, 0.3],
    subsample=[0.7, 0.8, 0.9, 1.0],
    max_depth=[2, 3, 4, 5],
)

# Robust regression with Huber loss
# epsilon candidates: 1.0, 1.1, ..., 2.0; alpha candidates: 0, 0.001, ..., 0.01
from sklearn.linear_model import HuberRegressor
HR = HuberRegressor()
HR_param_grid = dict(
    epsilon=np.linspace(1.0, 2.0, num=11),
    alpha=np.linspace(0, 0.01, num=11),
)


# Gaussian mixture: pick the number of components with the lowest BIC
from sklearn.mixture import GaussianMixture
BIC = []

# try 1 to 6 components and record the BIC of each fit
for n_components in range(1, 7):
    candidate = GaussianMixture(n_components=n_components, random_state=42)
    candidate.fit(data_numeric)
    BIC.append(candidate.bic(data_numeric))

# list position 0 corresponds to 1 component, hence the + 1
G = np.argmin(BIC) + 1
GMM = GaussianMixture(n_components=G, random_state=42)
# refit with the selected size and get a cluster label for every row
label = GMM.fit_predict(data_numeric)

### Performance Validation

### Regression

In [None]:
# unadjusted R^2 = 1 - SS_residual / SS_total
from sklearn.metrics import r2_score
print(round(r2_score(y_test, EN_pred), 3))

# RSSE = square root of the sum of squared errors
# (for RMSE divide the sum by len(y_test) BEFORE taking the root:
#  RMSE = sqrt(SSE / n) = RSSE / sqrt(n))
# np.sum / np.sqrt are vectorised, unlike the builtin sum
print(round(np.sqrt(np.sum((y_test - EN_pred) ** 2)), 3))



### Binary Classification

In [None]:
def AUROC(truth, prob, pos_label, column = 0):
    """Plot the ROC curve and return the area under it.

    Parameters
    ----------
    truth : array-like
        True class labels.
    prob : 2-D array
        Class scores/probabilities, one column per class (e.g. the output of
        ``predict_proba``).
    pos_label : label
        The label in ``truth`` treated as the positive class.
    column : int, default 0
        Column of ``prob`` holding the score of the positive class. It must
        correspond to ``pos_label``; otherwise the curve is mirrored.
        NOTE(review): for ``predict_proba`` output the column order presumably
        follows the model's ``classes_`` -- confirm before relying on 0.

    Returns
    -------
    float
        Area under the ROC curve.
    """
    from sklearn.metrics import RocCurveDisplay, auc, roc_curve

    scores = prob[:, column]
    fpr, tpr, _ = roc_curve(truth, scores, pos_label = pos_label)
    roc_auc = auc(fpr, tpr)
    # passing roc_auc makes the plot legend show the area as well
    RocCurveDisplay(fpr = fpr, tpr = tpr, roc_auc = roc_auc).plot()
    return roc_auc

# bunch of rates
# behave = custom metric based on behavioural economics; pain from loss is approx twice the joy from gain
# thus, false positive should be twice as painful as true positive is gainful
# confusion matrix used below (rows = truth, columns = prediction, positive class first):
# [[TP, FN],
#  [FP, TN]]

# recall/sensitivity = true positive rate = TP/(TP + FN)
# precision = TP/(TP + FP)
# false positive rate = type 1 error = FP/(FP+TN)
# power = 1 - type 2 error = recall
# specificity = true negative rate = TN/(TN + FP)
# f1 = harmonic mean of precision & sensitivity = 2*precision*sensitivity/(precision + sensitivity)

def binary_classify_metric(truth, pred, pos_label, nround = 10):
    """Return F1 and the four confusion-matrix rates for a binary classifier.

    Parameters
    ----------
    truth : array-like
        True class labels.
    pred : array-like
        Predicted class labels.
    pos_label : label
        The label treated as the positive class; every other label counts as
        negative.
    nround : int, default 10
        Number of decimals the rates are rounded to.

    Returns
    -------
    dict
        Keys 'f1', 'TPR', 'FPR', 'TNR', 'FNR', 'behave'. A rate whose
        denominator is zero (no positives / no negatives in ``truth``) is NaN.
    """
    import numpy as np
    from sklearn.metrics import confusion_matrix, f1_score

    f1 = f1_score(truth, pred, pos_label = pos_label)

    # sklearn orders confusion-matrix rows/columns by sorted label, which puts
    # the positive class first only by accident. Recode to 1 = positive,
    # 0 = negative and request the order [1, 0] so the layout is always
    # [[TP, FN], [FP, TN]] regardless of what the labels are.
    truth_is_pos = (np.asarray(truth) == pos_label).astype(int)
    pred_is_pos = (np.asarray(pred) == pos_label).astype(int)
    conf_mat = confusion_matrix(truth_is_pos, pred_is_pos, labels = [1, 0])
    (tp, fn), (fp, tn) = conf_mat

    def _rate(count, total):
        # share of `count` in `total`; NaN instead of a 0/0 warning
        return round(count / total, nround) if total else float('nan')

    n_pos = tp + fn  # actual positives
    n_neg = fp + tn  # actual negatives
    TPR = _rate(tp, n_pos)
    FPR = _rate(fp, n_neg)
    TNR = _rate(tn, n_neg)
    FNR = _rate(fn, n_pos)
    # NOTE(review): both terms enter with a positive sign although FPR is the
    # "pain" and TPR the "gain" -- confirm whether a difference was intended
    behave = 0.66 * FPR + 0.33 * TPR

    val = {'f1':f1, 'TPR':TPR, 'FPR':FPR, 'TNR':TNR, 'FNR':FNR, 'behave':behave}
    return val

### Clustering

In [None]:
# adjusted Rand index: agreement between two labelings of the same observations
from sklearn.metrics import adjusted_rand_score
adjusted_rand_score(labels_true=label1, labels_pred=label2)