# OUTLINE

## DROPPING MISSING VALUES & DUPLICATE ROWS

In [None]:
df.dropna(inplace=True)
df.drop_duplicates(inplace=True)

## K-FOLD SPLIT OR TRAIN-TEST-SPLIT
**Note to self, do this before any sort of transformations!**

In [None]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.25, random_state=42)

## HANDLING CLASS IMBALANCE USING SMOTE

In [None]:
! pip install imbalanced-learn
import imblearn
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from imblearn.pipeline import Pipeline

over = SMOTE(sampling_strategy=0.1)
under = RandomUnderSampler(sampling_strategy=0.5)
steps = [('o', over), ('u', under)]
pipeline = Pipeline(steps=steps)
# transform the dataset
X, y = pipeline.fit_resample(X, y)



## DROPPING NON-CORRELATED FEATURES

In [None]:
def correlation(dataset, threshold):
    col_corr = set()  # Set of all the names of correlated columns
    corr_matrix = dataset.corr()
    for i in range(len(corr_matrix.columns)):
        for j in range(i):
            if abs(corr_matrix.iloc[i, j]) > threshold: # we are interested in absolute coeff value
                colname = corr_matrix.columns[i]  # getting the name of column
                col_corr.add(colname)
    return col_corr


corr_features = correlation(df, 0.8)
len(set(corr_features))

# Drop the columns stored in the corr_features
df.drop(corr_features,axis=1, inplace=True)


## LABEL ENCODING

In [None]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
le.fit()
y_train_transformed = le.transform(y_train)
y_test_transformed = le.transform(y_test)
# le.classes_

## ONE-HOT ENCODING AND ORDINAL ENCODING

### ORDINAL ENCODING 

In [None]:
# return_df=True

In [None]:
from sklearn.preprocessing import LabelEncoder

# Ordinalencoding -> cough
oe = OrdinalEncoder(categories=[['Mild','Strong']])
X_train_cough = oe.fit_transform(X_train[['cough']])

# also the test data
X_test_cough = oe.fit_transform(X_test[['cough']])

X_train_cough.shape

### ONE-HOT ENCODING

In [None]:
from sklearn.preprocessing import OneHotEncoder

# OneHotEncoding -> gender,city
# ohe = OneHotEncoder(drop='first',dtype=np.int32)
ohe = OneHotEncoder(drop='first',sparse=False)

X_train_gender_city = ohe.fit_transform(X_train[['gender','city']])

# also the test data
X_test_gender_city = ohe.fit_transform(X_test[['gender','city']])

X_train_gender_city.shape

In [None]:
X_train_transformed = np.concatenate((X_train_gender_city,X_train_cough),axis=1)
# also the test data
X_test_transformed = np.concatenate((X_test_gender_city,X_test_cough),axis=1)

### COMBINE ALL OF THIS.....SIMPLE IS BETTER THAN COMPLEX!!!

In [3]:
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
transformer = ColumnTransformer(transformers=[
    ('num', SimpleImputer(strategy='median'), ['COL_NAME', 'ANOTHER_COL_NAME']),
    ('cat', SimpleImputer(strategy='most_frequent'), ['CON_NAME']),
    ('tnf1',SimpleImputer(),['fever']),
    ('tnf2',OrdinalEncoder(categories=[['Mild','Strong']]),['cough']),
    ('tnf3',OneHotEncoder(sparse=False,drop='first'),['gender','city'])
    ],
    remainder='passthrough')

X_train_all_transformed = transformer.fit_transform(X_train).shape
X_test_all_transformed = transformer.transform(X_test).shape

NameError: name 'OrdinalEncoder' is not defined

## MODEL BUILDING AND TESTING

### XgBoost

### CatBoost

### LightGBM

### FairGBM

### Random Forest

## SUBMISSION