# Lab | Cleaning numerical data 

## Import the necessary libraries.

In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
import re

pd.set_option('display.max_columns', None)

## Load the we_fn_use_c_marketing_customer_value_analysis.csv into the variable customer_df

In [None]:
variable_customer_df = pd.read_csv('we_fn_use_c_marketing_customer_value_analysis.csv')

## First look at its main features (head, shape, info).

In [None]:
display(variable_customer_df.shape)

In [None]:
variable_customer_df.head()

In [None]:
variable_customer_df.info()

## Rename the columns so they follow PEP 8 snake case (lower_case_with_underscores)

In [None]:
variable_customer_df.columns
column_names = variable_customer_df.columns
column_names

In [None]:
# Normalise every header to snake_case: lower-case it and turn spaces into underscores.
cols = [name.lower().replace(' ', '_') for name in variable_customer_df.columns]
cols
variable_customer_df.columns = cols

variable_customer_df.head()

## Change effective to date column to datetime format.

In [None]:
variable_customer_df['effective_to_date'] = pd.to_datetime(variable_customer_df['effective_to_date'], errors='coerce')

In [None]:
variable_customer_df

## Define a function that differentiates between continuous and discrete variables. Hint: The number of unique values might be useful. Store continuous data into a continuous_df variable and do the same for discrete_df

In [None]:
variable_customer_df_num = variable_customer_df.select_dtypes(include = np.number)
variable_customer_df_num.columns

In [None]:
for column in variable_customer_df.columns:
#     print(column, ":", variable_customer_df[column].unique()) # unique values for each column
    print(column, ":", len(variable_customer_df[column].unique())) # number of unique value for each column

In [None]:
# def value_counts(df):
#     for column in df.columns:
#         print(column, ":", len(df[column].unique())) # number of unique value for each column
#     return

In [None]:
# value_count(variable_customer_df_num)

In [None]:
def cont_disc(df, threshold=202):
    """Split the columns of ``df`` into continuous and discrete by cardinality.

    A column is treated as continuous when its number of distinct values is
    at least ``threshold``; otherwise it is treated as discrete.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose columns are classified.
    threshold : int, default 202
        Minimum number of distinct values for a column to count as
        continuous. A cutoff relative to the row count (for example
        ``df.shape[0] * 0.02``) is another option.

    Returns
    -------
    tuple[list, list]
        ``(continuous_lst, discrete_lst)``: two lists of column names, each in
        the column order of ``df``.
    """
    continuous_lst = []
    discrete_lst = []

    for column in df.columns:
        # dropna=False counts NaN as a distinct value, exactly like
        # len(Series.unique()) does.
        if df[column].nunique(dropna=False) >= threshold:
            continuous_lst.append(column)
        else:
            discrete_lst.append(column)

    return continuous_lst, discrete_lst

In [None]:
continuous_var, discrete_var = cont_disc(variable_customer_df_num)

print('continuous=', continuous_var)
print('discrete=', discrete_var)

In [None]:
display(continuous_var)
display(discrete_var)

In [None]:
# Select the columns that cont_disc classified as continuous instead of
# retyping their names by hand, so the frame always matches the classification.
continuous_df = variable_customer_df_num[continuous_var]
continuous_df

In [None]:
# Select the columns that cont_disc classified as discrete instead of
# retyping their names by hand, so the frame always matches the classification.
discrete_df = variable_customer_df_num[discrete_var]
discrete_df

In [None]:
# return continuous and discrete dataframes inside function

# def cont_disc2(df): # df = dataframe to pass function to
        
#     continuous_lst = [] # set up empty lists
#     discrete_lst = []
    
#     # column becomes the next column name
#     for column in df.columns:               # < (df.shape[0] * 0.02) another option to differentiate: if the number of rows is less than cutoff it is discrete; more than cutoff, it is continuous
#         if len(df[column].unique()) >= 202: # I chose 202 as a cutoff to differentiate the continuous and discrete variables.
#             continuous_lst.append(column) #append column name to continuous
#         else:
#             discrete_lst.append(column)
    
#     continuous_df = df[continuous_lst]
#     discrete_df = df[discrete_lst]

#     return continuous_df, discrete_df

In [None]:
# continuous_df, discrete_df = disc_cont2(variable_customer_df_num)

In [None]:
# continuous_df

In [None]:
# discrete_df

## Plot a correlation matrix, comment what you see.

In [None]:
# Correlate numeric columns only: the frame also holds string and datetime
# columns, on which DataFrame.corr() raises in pandas >= 2.0.
correlations = variable_customer_df.select_dtypes(include=np.number).corr()
correlations.head()

In [None]:
fig, ax = plt.subplots(figsize=(20,20))
sns.heatmap(correlations, annot=True)
plt.show()

In [None]:
# No pair of features shows high collinearity (e.g. a correlation coefficient of 0.9 or more), so no column was dropped.

## Create a function to plot every discrete variable. Do the same with the continuous variables (Be Careful, you may need to change the plot type to one better suited for continuous data!)

In [None]:
def plot(x):
    """Show one seaborn distribution plot (with KDE overlay) per column of ``x``.

    Each column is drawn on its own figure and shown immediately.
    Returns the ``matplotlib.pyplot`` module.
    """
    for name in x.columns:
        series = x[name]
        sns.displot(series, kde=True)
        plt.show()

    return plt

In [None]:
print(plot(discrete_df))

In [None]:
def plot_cont(x):
    """Show one histogram per column of ``x``.

    Each column is drawn with ``Series.hist`` and shown immediately.
    Returns the ``matplotlib.pyplot`` module.
    """
    for name in x.columns:
        series = x[name]
        series.hist()
        plt.show()

    return plt

In [None]:
print(plot_cont(continuous_df))

## Comment what you can see in the plots.

In [None]:
# Discrete variables: the distribution plots show that months_since_last_claim and months_since_policy_inception
# have relatively evenly distributed values. number_of_open_complaints and number_of_policies have outliers.

# There are outliers in all continuous variables.

## Look for outliers in the continuous variables. (HINT: There’s a good plot to do that!)

In [None]:
def box_cont(x):
    """Show one vertical seaborn box plot per column of ``x``.

    Each column is drawn on the y axis and shown immediately.
    Returns the ``matplotlib.pyplot`` module.
    """
    for name in x.columns:
        values = x[name]
        sns.boxplot(y=values)
        plt.show()

    return plt

In [None]:
print(box_cont(continuous_df))

In [None]:
def outliers(column):
    """Return the values of ``column`` that lie outside the Tukey fences.

    The fences are ``Q1 - 1.5 * IQR`` and ``Q3 + 1.5 * IQR``. A value is an
    outlier only when it is strictly beyond a fence, so a column with zero
    spread (IQR == 0) yields no outliers instead of flagging every value.

    Parameters
    ----------
    column : iterable of numbers
        For example a pandas Series or a list.

    Returns
    -------
    list
        The outlying values, in their original order. Empty for empty input.
    """
    values = list(column)
    if not values:
        return []

    # np.percentile does not need pre-sorted input.
    q25, q75 = np.percentile(values, [25, 75])
    iqr = q75 - q25

    upper_limit = q75 + 1.5 * iqr
    lower_limit = q25 - 1.5 * iqr

    return [x for x in values if x < lower_limit or x > upper_limit]

In [None]:
tca_outliers = outliers(continuous_df['total_claim_amount'])
tca_outliers

In [None]:
income_outliers = outliers(continuous_df['income'])
income_outliers # income column has no outlier

In [None]:
customer_lifetime_value_outliers = outliers(continuous_df['customer_lifetime_value'])
customer_lifetime_value_outliers

In [None]:
monthly_premium_auto_outliers = outliers(continuous_df['monthly_premium_auto'])
monthly_premium_auto_outliers

## Did you find outliers? Comment what you will do with them.

In [None]:
# Although the 'outliers' function finds outliers in every column of continuous_df except 'income', the box plots do not suggest
# that many outliers in the columns of continuous_df.

## Check all columns for NaN values. Decide what (if anything) you will need to do with them.

In [None]:
# no null value in the variable_customer_df 
variable_customer_df.isnull().sum()

# Lab | Cleaning categorical data.

## Import the necessary libraries if you are starting a new notebook. Using the same data as the previous lab: we_fn_use_c_marketing_customer_value_analysis.csv

In [None]:
# Same file is imported in the beginning of the notebook.

## Find all of the categorical data. Save it in a categorical_df variable.

In [None]:
categorical_df = variable_customer_df.select_dtypes(include = object)
categorical_df.head()

## Check for NaN values.

In [None]:
# no null values
categorical_df.isnull().sum()

## Check all unique values of columns.

In [None]:
for column in categorical_df.columns:
    print(column, ":", categorical_df[column].unique()) # unique values for each column
    print(column, ":", len(categorical_df[column].unique())) # number of unique value for each column

## Check dtypes. Do they all make sense as categorical data?

In [None]:
categorical_df.dtypes

# All columns have categorical values. This implies that the dataframe was fully categorized into numerical and
# categorical parts.

In [None]:
categorical_df.head(20)

## Does any column contain alpha and numeric data? Decide how to clean it.



In [None]:
# Substitute NaNs with mode: this is not a great solution. In this way, we overemphasize the mode value of the column.
# We can improve a model (KNN classifier) to predict an intermediate target. 

In [None]:
# check for numeric values: there is no column that has only numeric values.
for column in categorical_df.columns:
    print(categorical_df[column].str.isnumeric().value_counts()) 

In [None]:
# check for alpha values: there are some columns (i.e. state, education) that have alpha values.
for column in categorical_df.columns: 
    print(categorical_df[column].str.isalpha().value_counts()) 

In [None]:
# check for alphanumeric characters: there are some columns that have alphanumeric characters.
for column in categorical_df.columns:
    print(categorical_df[column].str.isalnum().value_counts()) 

## Would you choose to do anything else to clean or wrangle the categorical data? Comment your decisions.

In [None]:
# We can apply qcut or cut techniques to divide the categorical data into bins.
# For categorical variables we can plot the relation between bins and check outliers.

## Compare policy_type and policy. What information is contained in these columns. Can you identify what is important?

In [None]:
# Personal Auto     6788
# Corporate Auto    1968
# Special Auto       378
# Name: policy_type, dtype: int64
# Personal L3     3426
# Personal L2     2122
# Personal L1     1240
# Corporate L3    1014
# Corporate L2     595
# Corporate L1     359
# Special L2       164
# Special L3       148
# Special L1        66
# Name: policy, dtype: int64

In [None]:
# Policy_type column is divided into three; personal auto, corporate auto, and special auto.
# The personal auto, corporate auto, and special auto are subdivided into three in policy column.
# Therefore, detailed information is given in policy column.
# In this case, policy column could be removed since all the information in policy column are also given in policy_type column as a simplified classification.

## Check number of unique values in each column, can they be combined in any way to ease encoding? Comment your thoughts and make those changes.

In [None]:
# The unique values in policy column can be combined. 
# The policy_type column already has the combined version of the unique values in policy column.

# Luxury SUV and Luxury Car can be combined to form a single Luxury value in vehicle_class.

# Disabled, Retired, and Medical Leave can be combined and merged into the Unemployed value in employment status.

# Bachelor and college can be combined to form undergraduate value and Master and Doctor can be combined to form graduate value.

In [None]:
for column in categorical_df.columns:
    print(categorical_df[column].value_counts()) 

In [None]:
# categorical_df = categorical_df.drop(['policy'], axis=1)

In [None]:
# categorical_df = categorical_df.replace({'vehicle_class': {'Luxury SUV': 'Luxury', 'Luxury Car': 'Luxury'}})

# categorical_df = categorical_df.replace({'employmentstatus': {'Disabled': 'Unemployed', 'Retired': 'Unemployed ', 'Medical Leave': 'Unemployed'}})

# categorical_df = categorical_df.replace({'education': {'Master': 'Graduate', 'Doctor': 'Graduate', 'Bachelor': 'Undergraduate','College': 'Undergraduate'}})

In [None]:
for column in categorical_df.columns:
    print(categorical_df[column].value_counts()) 

# Lab | Feature extraction

## Open the categoricals variable we created before.

In [None]:
# Use the builtin ``object``: the ``np.object`` alias was removed in NumPy 1.24.
categoricals = variable_customer_df.select_dtypes(include=object)
categoricals.head()

## Plot all the categorical variables with the proper plot. What can you see?

In [None]:
# There are 9134 unique values in the customer column, so I do not evaluate the plot of the customer column.

# When I look to other plots of the categorical variables, the data is imbalanced except for gender column.

# Distribution of the values in the columns varies. Oversampling, undersampling or smote can be applied.

In [None]:
def plot_cat(df):
    """Show one seaborn count plot per column of ``df``.

    Each column is drawn on the x axis and shown immediately. Returns None.
    """
    for name in df.columns:
        categories = df[name]
        sns.countplot(x=categories)
        plt.show()

In [None]:
plot_cat(categoricals)

## There might be some columns that seem to be redundant, check their values to be sure. What should we do with them?

In [None]:
# The unique values in policy column can be combined. 

# The customer column has 9134 unique values. It comprises alphanumeric characters. We need to combine the unique values in customer column to decrease the number of unique values. 

# The policy_type column already has the combined version of the unique values in policy column.

# Luxury SUV and Luxury Car can be combined to form Luxury vales in vehicle_class.

# Disable, retired, and medical leave can be combined and attached into Unemployed value in employment status.

# Bachelor and college can be combined to form undergraduate value and Master and Doctor can be combined to form graduate value.

In [None]:
categoricals.isnull().sum() # no null values.

In [None]:
for column in categoricals.columns:
    print(column, ":", categoricals[column].unique()) # unique values for each column
    print(column, ":", len(categoricals[column].unique())) # number of unique value for each column

In [None]:
for column in categoricals.columns:
    print(categoricals[column].value_counts()) 

In [None]:
# Substitute NaNs with mode: this is not a great solution. In this way, we overemphasize the mode value of the column.
# We can improve a model (KNN classifier) to predict an intermediate target. 

In [None]:
# check for numeric values: there is no column that has only numeric values.
for column in categoricals.columns:
    print(categoricals[column].str.isnumeric().value_counts()) 

In [None]:
# check for alpha values: there are some columns (i.e. state, education) that have alpha values.
for column in categoricals.columns: 
    print(categoricals[column].str.isalpha().value_counts()) 

In [None]:
# check for alphanumeric characters: there are some columns that have alphanumeric characters.
for column in categoricals.columns:
    print(categoricals[column].str.isalnum().value_counts()) 

In [None]:
# We can apply qcut or cut techniques to divide the categorical data into bins.
# For categorical variables we can plot the relation between bins and check outliers.

## Plot time variable. Can you extract something from it?

In [None]:
# There are drops every three or four months.

# At the end or middle of certain months, the policy could expire near the payment of checks.

# This might lead to drop in wages of employees. 

In [None]:
chart = sns.histplot(variable_customer_df['effective_to_date'])
plt.xticks(rotation=70)
plt.show()

# Lab | Comparing regression models

## In this final lab, we will model our data. Import sklearn train_test_split and separate the data.

In [None]:
from sklearn.model_selection import train_test_split

## We will start with removing outliers, if you have not already done so. We have discussed different methods to remove outliers. Use the one you feel more comfortable with, define a function for that. Use the function to remove the outliers and apply it to the dataframe.

In [None]:
def box_cont(x):
    """Show one vertical seaborn box plot per column of ``x``.

    Each column is drawn on the y axis and shown immediately.
    Returns the ``matplotlib.pyplot`` module.
    """
    for name in x.columns:
        values = x[name]
        sns.boxplot(y=values)
        plt.show()

    return plt

In [None]:
print(box_cont(continuous_df))

In [None]:
def outliers(column):
    """Return the values of ``column`` that lie outside the Tukey fences.

    The fences are ``Q1 - 1.5 * IQR`` and ``Q3 + 1.5 * IQR``. A value is an
    outlier only when it is strictly beyond a fence, so a column with zero
    spread (IQR == 0) yields no outliers instead of flagging every value.

    Parameters
    ----------
    column : iterable of numbers
        For example a pandas Series or a list.

    Returns
    -------
    list
        The outlying values, in their original order. Empty for empty input.
    """
    values = list(column)
    if not values:
        return []

    # np.percentile does not need pre-sorted input.
    q25, q75 = np.percentile(values, [25, 75])
    iqr = q75 - q25

    upper_limit = q75 + 1.5 * iqr
    lower_limit = q25 - 1.5 * iqr

    return [x for x in values if x < lower_limit or x > upper_limit]

In [None]:
tca_outliers = outliers(continuous_df['total_claim_amount'])
tca_outliers

In [None]:
income_outliers = outliers(continuous_df['income'])
income_outliers # income column has no outlier

In [None]:
customer_lifetime_value_outliers = outliers(continuous_df['customer_lifetime_value'])
customer_lifetime_value_outliers

In [None]:
monthly_premium_auto_outliers = outliers(continuous_df['monthly_premium_auto'])
monthly_premium_auto_outliers

In [None]:
# The 'outliers' function determines that the columns of continuous_df except 'income' have outliers.

In [None]:
# Two solutions:
# 1. log transform is a way to deal with outliers
# 2. remove outliers

In [None]:
def log_transfom_clean(x):
    """Return the natural log of ``x``, or NaN when the log is undefined.

    Any ``x`` that is not strictly positive (zero, negative, or NaN) maps to
    ``np.nan`` so the caller can impute those entries afterwards.
    """
    if x > 0:
        return np.log(x)
    # ``np.nan`` is the canonical spelling; the ``np.NAN`` alias was removed in NumPy 2.0.
    return np.nan

In [None]:
def plothist(col):
    """Show a histogram of the log-transformed ``col`` of ``variable_customer_df``.

    The column is passed element-wise through ``log_transfom_clean`` before
    plotting. Returns None.
    """
    transformed = variable_customer_df[col].apply(log_transfom_clean)
    pd.Series(transformed).hist()
    plt.show()

In [None]:
plothist('total_claim_amount')

In [None]:
plothist('monthly_premium_auto')

In [None]:
plothist('customer_lifetime_value')

In [None]:
variable_customer_df['total_claim_amount'].mean()

In [None]:
variable_customer_df['total_claim_amount_transformed'] = variable_customer_df['total_claim_amount'].apply(log_transfom_clean)

# replace NaNs with mean of transformed data
variable_customer_df['total_claim_amount_transformed'] = variable_customer_df['total_claim_amount_transformed'].fillna(np.mean(variable_customer_df['total_claim_amount_transformed']))
variable_customer_df['total_claim_amount_transformed'].hist()
plt.show()

In [None]:
variable_customer_df['monthly_premium_auto'].mean()

In [None]:
variable_customer_df['monthly_premium_auto_transformed'] = variable_customer_df['monthly_premium_auto'].apply(log_transfom_clean)

# replace NaNs with mean of transformed data
variable_customer_df['monthly_premium_auto_transformed'] = variable_customer_df['monthly_premium_auto_transformed'].fillna(np.mean(variable_customer_df['monthly_premium_auto_transformed']))
variable_customer_df['monthly_premium_auto_transformed'].hist()
plt.show()

In [None]:
variable_customer_df['customer_lifetime_value'].mean()

In [None]:
variable_customer_df['customer_lifetime_value_transformed'] = variable_customer_df['customer_lifetime_value'].apply(log_transfom_clean)

# replace NaNs with mean of transformed data
variable_customer_df['customer_lifetime_value_transformed'] = variable_customer_df['customer_lifetime_value_transformed'].fillna(np.mean(variable_customer_df['customer_lifetime_value_transformed']))
variable_customer_df['customer_lifetime_value_transformed'].hist()
plt.show()

In [None]:
variable_customer_df['customer_lifetime_value_transformed'].mean()

In [None]:
variable_customer_df['monthly_premium_auto_transformed'].mean()

In [None]:
variable_customer_df['total_claim_amount_transformed'].mean()

In [None]:
variable_customer_df['total_claim_amount_transformed'].value_counts(dropna=False)

In [None]:
def ul_ll(column):
    """Return the Tukey fences of ``column`` as ``(upper_limit, lower_limit)``.

    The fences sit 1.5 interquartile ranges above the 75th percentile and
    below the 25th percentile. Note the order: the upper limit comes first.
    """
    ordered = sorted(column)
    first_quartile = np.percentile(ordered, 25)
    third_quartile = np.percentile(ordered, 75)

    margin = 1.5 * (third_quartile - first_quartile)

    return third_quartile + margin, first_quartile - margin

In [None]:
income_upperl_lowerl = ul_ll(variable_customer_df['income'])
income_upperl_lowerl # income column has no outlier

In [None]:
total_claim_amount_upperl_lowerl = ul_ll(variable_customer_df['total_claim_amount_transformed'])
total_claim_amount_upperl_lowerl # (upper_limit, lower_limit) fences of the log-transformed total_claim_amount

In [None]:
monthly_premium_auto_upperl_lowerl = ul_ll(variable_customer_df['monthly_premium_auto_transformed'])
monthly_premium_auto_upperl_lowerl # (upper_limit, lower_limit) fences of the log-transformed monthly_premium_auto

In [None]:
customer_lifetime_value_upperl_lowerl = ul_ll(variable_customer_df['customer_lifetime_value_transformed'])
customer_lifetime_value_upperl_lowerl # (upper_limit, lower_limit) fences of the log-transformed customer_lifetime_value

In [None]:
variable_customer_df_customer_lifetime_valueNaN_dropped_1 = variable_customer_df[variable_customer_df['customer_lifetime_value_transformed'] <= 10.313001003057334].copy()

In [None]:
variable_customer_df_customer_lifetime_valueNaN_dropped_2 = variable_customer_df[variable_customer_df['customer_lifetime_value_transformed'] >= 7.0803778658926175].copy()

In [None]:
variable_customer_df_customer_lifetime_valueNaN_dropped = pd.concat([variable_customer_df_customer_lifetime_valueNaN_dropped_1,variable_customer_df_customer_lifetime_valueNaN_dropped_2],axis=0)
variable_customer_df_customer_lifetime_valueNaN_dropped

In [None]:
variable_customer_df_customer_lifetime_valueNaN_dropped_1 = variable_customer_df[variable_customer_df['monthly_premium_auto_transformed'] <= 5.399108147808699].copy()

In [None]:
variable_customer_df_customer_lifetime_valueNaN_dropped_2 = variable_customer_df[variable_customer_df['monthly_premium_auto_transformed'] >= 3.5117474395965522].copy()

In [None]:
variable_customer_df_customer_lifetime_valueNaN_dropped = pd.concat([variable_customer_df_customer_lifetime_valueNaN_dropped_1,variable_customer_df_customer_lifetime_valueNaN_dropped_2],axis=0)
variable_customer_df_customer_lifetime_valueNaN_dropped

In [None]:
variable_customer_df_customer_lifetime_valueNaN_dropped_1 = variable_customer_df[variable_customer_df['total_claim_amount_transformed'] <= 7.353347257270782].copy()

In [None]:
variable_customer_df_customer_lifetime_valueNaN_dropped_2 = variable_customer_df[variable_customer_df['total_claim_amount_transformed'] >= 4.558793261718426].copy()

In [None]:
variable_customer_df_customer_lifetime_valueNaN_dropped = pd.concat([variable_customer_df_customer_lifetime_valueNaN_dropped_1,variable_customer_df_customer_lifetime_valueNaN_dropped_2X = pd.concat([X_normalized, onehot_encoded], axis=1) X = pd.concat([X_normalized, onehot_encoded], axis=1) ],axis=0)
variable_customer_df_customer_lifetime_valueNaN_dropped

## Create a copy of the dataframe for the data wrangling.

In [None]:
# Take a real copy so the wrangling below (new year/month/day columns) does
# not modify the cleaned frame through a shared reference.
data = variable_customer_df_customer_lifetime_valueNaN_dropped.copy()

In [None]:
data

## Normalize the continuous variables. You can use any one method you want.

In [None]:
y = data['total_claim_amount_transformed']
# Drop the raw total_claim_amount as well: y is its log transform, so leaving
# it among the features leaks the target into X.
X = data.drop(['total_claim_amount_transformed', 'total_claim_amount'], axis=1)

In [None]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
X_train_num = X_train.select_dtypes(include = np.number)
X_train_cat = X_train.select_dtypes(include = object)
X_test_num = X_test.select_dtypes(include = np.number)
X_test_cat = X_test.select_dtypes(include = object)

In [None]:
continuous_df = data[['customer_lifetime_value_transformed', 'income','monthly_premium_auto_transformed','total_claim_amount_transformed']]
continuous_df

In [None]:
from sklearn.preprocessing import MinMaxScaler

In [None]:
MinMaxtransformer = MinMaxScaler().fit(continuous_df)
X_normalized = MinMaxtransformer.transform(continuous_df)
print(type(X_normalized))
# transform() returns a bare ndarray; reattach the row labels of continuous_df
# so the scaled frame still aligns with the other frames on concat.
X_normalized = pd.DataFrame(X_normalized, columns=continuous_df.columns, index=continuous_df.index)
display(X_normalized.head())
print(type(X_normalized))

## Encode the categorical variables (See the hint below for encoding categorical data!!!)

In [None]:
discrete_df = data[['state','coverage','employmentstatus','location_code','marital_status','policy_type',
                    'policy','renew_offer_type','customer','months_since_last_claim', 
                    'months_since_policy_inception','number_of_open_complaints','number_of_policies',
                   'sales_channel','vehicle_class','vehicle_size']]
discrete_df

In [None]:
onehot_columns=data[['state','marital_status','policy_type',
                    'policy','renew_offer_type','customer','months_since_last_claim', 
                    'months_since_policy_inception','number_of_open_complaints','number_of_policies',
                   'sales_channel','vehicle_class']]

In [None]:
from sklearn.preprocessing import OneHotEncoder

In [None]:
encoder = OneHotEncoder(drop='first').fit(onehot_columns)

In [None]:
# get_feature_names was removed in scikit-learn 1.2; get_feature_names_out is its replacement.
cols = encoder.get_feature_names_out(input_features=onehot_columns.columns)

In [None]:
print(encoder.categories_)

In [None]:
encoded = encoder.transform(onehot_columns).toarray()

In [None]:
print(encoded)

In [None]:
# Reuse the row labels of the encoder input: with a fresh RangeIndex the
# encoded rows pair up with the wrong rows of ``ordinal`` when concatenated.
onehot_encoded = pd.DataFrame(encoded, columns=cols, index=onehot_columns.index)

In [None]:
onehot_encoded.head()

In [None]:
# Copy the slice: the following cells assign into ``ordinal``, which on a
# view of ``data`` triggers SettingWithCopyWarning and may not take effect.
ordinal = data[['coverage','employmentstatus','location_code','vehicle_size']].copy()

In [None]:
ordinal["coverage"] = ordinal["coverage"].map({"Basic" : 0, "Extended" : 1, "Premium" : 2})

In [None]:
ordinal["employmentstatus"] = ordinal["employmentstatus"].map({"Employed" : 0, "Unemployed" : 1, "Medical Leave" : 2,"Disabled" : 3,"Retired" : 4})

In [None]:
ordinal["location_code"] = ordinal["location_code"].map({"Suburban" : 0, "Rural" : 1, "Urban" : 2})

In [None]:
ordinal["vehicle_size"] = ordinal["vehicle_size"].map({"Medsize" : 0, "Small" : 1, "Large" : 2})

In [None]:
onehot_encoded.reset_index()

In [None]:
onehot_encoded = onehot_encoded.loc[~onehot_encoded.index.duplicated(keep='first')]

In [None]:
ordinal.reset_index()

In [None]:
ordinal = ordinal.loc[~ordinal.index.duplicated(keep='first')]

In [None]:
onehotencoded = pd.concat([onehot_encoded,ordinal],axis=1)

In [None]:
onehotencoded

## The time variable can be useful. Try to transform its data into a useful one. Hint: Day week and month as integers might be useful.

In [None]:
data.dtypes

In [None]:
data['year'] = data['effective_to_date'].dt.year

In [None]:
data['month'] = data['effective_to_date'].dt.month

In [None]:
data['day'] = data['effective_to_date'].dt.day

In [None]:
data

## Since the model will only accept numerical data, check and make sure that every column is numerical, if some are not, change it using encoding.

In [None]:
onehotencoded = onehotencoded.loc[~onehotencoded.index.duplicated(keep='first')]

In [None]:
continuous_df = continuous_df.loc[~continuous_df.index.duplicated(keep='first')]

In [None]:
data['year'] = data['year'].loc[~data['year'].index.duplicated(keep='first')]
data.shape

In [None]:
data['month'] = data['month'].loc[~data['month'].index.duplicated(keep='first')]

In [None]:
data['day'] = data['day'].loc[~data['day'].index.duplicated(keep='first')]

In [None]:
date = data[['day','month','year']]

In [None]:
data_new = pd.concat([continuous_df, onehotencoded], axis=1)
data_new

In [None]:
data = data[~data.index.duplicated()]

In [None]:
date = date[~date.index.duplicated()]

In [None]:
data_new2 = pd.concat([data_new, date], axis=1)
data_new2

In [None]:
data_new2.dtypes

In [None]:
X_cat = data_new2.select_dtypes(include = object)
X_cat # no categoricals. all variables are numerical

In [None]:
# Scaling data
# Fit the scaler on the training features only, then apply it to the test features.
# MinMaxtransformer was fitted on continuous_df, whose columns differ from
# X_test_num, so reusing it here fails with a feature-mismatch error.
numeric_scaler = MinMaxScaler().fit(X_train_num)
X_test_normalized = numeric_scaler.transform(X_test_num)
X_test_norm = pd.DataFrame(X_test_normalized, columns=X_test_num.columns, index=X_test_num.index)
X_test_norm.shape

In [None]:
#Encoding categoricals using previous encoder
#We do not need to fit again.
# The encoder was fitted on onehot_columns, so the test rows must be passed
# with exactly that column set and order (X_test_cat holds different columns).
X_test_onehot = X_test[onehot_columns.columns]
encoded = encoder.transform(X_test_onehot).toarray()
# get_feature_names was removed in scikit-learn 1.2; get_feature_names_out is its replacement.
cols = encoder.get_feature_names_out(input_features=X_test_onehot.columns)
onehot_encoded_test = pd.DataFrame(encoded, columns=cols, index=X_test_onehot.index)

## Try a simple linear regression with all the data to see whether we are getting good results.

In [None]:
from sklearn import linear_model

In [None]:
lm = linear_model.LinearRegression()
# LinearRegression needs an all-numeric matrix; X_train still contains string
# and datetime columns. Fit on the numeric training features, and make sure
# the raw total_claim_amount (the untransformed target) is not among them.
X_train_model = X_train_num.drop(columns=['total_claim_amount'], errors='ignore')
lm.fit(X_train_model, y_train)

## Great! Now define a function that takes a list of models and train (and tests) them so we can try a lot of them without repeating code.

## Use the function to check LinearRegressor and KNeighborsRegressor.

## You can check also the MLPRegressor for this task!

## Check and discuss the results.

# Lab | Random variable distributions

## Get the numerical variables from our dataset.

## Check using a distribution plot if the variables fit the theoretical normal or exponential distribution.

## Check if any of the transformations (log-transform, etc.) we have seen up to this point changes the result.