In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer

# Seed NumPy's global random number generator so that any NumPy-based
# randomness in this notebook is repeatable from run to run.
np.random.seed(42)

In [2]:
# Load the raw regression dataset. index_col=False tells pandas not to use
# the first column of the file as the row index.
df = pd.read_csv(
    filepath_or_buffer='./data/data-regression.csv',
    index_col=False,
)

In [3]:
# Preview the first ten rows to get a feel for the columns and their values.
df.head(n=10)

Unnamed: 0,feature1,feature2,feature3,feature4,target
0,355.19,blue,,0.0,-143.065368
1,308.12,green,-16.278,8.0,-4.461594
2,500.51,green,,8.0,-0.835365
3,451.46,green,-8.159,,-34.189506
4,502.45,green,18.65,,26.118365
5,597.51,green,13.858,6.0,
6,238.03,,-98.232,5.0,97.595136
7,488.55,Red,-87.694,-12.0,72.342689
8,439.83,red,,-1.0,123.887711
9,591.54,green,5.088,,-13.667815


## Addressing any NaN values in the dataset

In [4]:
# Number of missing entries in each column of the raw data.
df.isnull().sum()

feature1    10
feature2    10
feature3    10
feature4    10
target      10
dtype: int64

Observations with a missing `target` value cannot be used to train or evaluate the model, so they are dropped from the dataset.

In [5]:
# Keep only the observations whose target value is present.
has_target = df['target'].notna()
df = df[has_target]

In [6]:
# Re-count the missing entries per column now that rows without a target are gone.
df.isnull().sum()

feature1     8
feature2    10
feature3     9
feature4     8
target       0
dtype: int64

If the share of missing values in a feature (column) or in an observation (row) is high, remove it from the dataset: columns that are more than 50% empty are dropped first, then observations that are missing more than 25% of their features.

In [7]:
# (rows, columns) before the column-level missing-value filter.
df.shape

(90, 5)

In [8]:
# Drop any column with more than 50% missing values.
# `thresh` is the minimum number of non-missing entries a column must have
# in order to be kept, so half the row count is the cut-off.
min_non_missing = 0.50 * df.shape[0]
df = df.dropna(axis='columns', thresh=min_non_missing)

Any remaining missing values are filled with the mean (or median) of the column for numeric features, and with the most frequent value of the column for categorical features.

In [9]:
# (rows, columns) after the column-level missing-value filter.
df.shape

(90, 5)

In [10]:
# Drop any row with more than 25% of its feature values missing.
#
# `thresh` is the minimum number of NON-missing values a row needs in order to
# be kept (it is not a count of allowed NaNs), so "at most 25% missing" becomes
# "at least 75% present". The target column is excluded through `subset`:
# rows without a target were already removed, so counting it would give every
# row one guaranteed non-missing value and skew the ratio.
feature_columns = df.columns.drop('target').tolist()
min_present_features = int(np.ceil(0.75 * len(feature_columns)))
df = df.dropna(axis=0, subset=feature_columns, thresh=min_present_features)

In [11]:
# (rows, columns) after the row-level missing-value filter.
df.shape

(90, 5)

In [12]:
# Per-column non-null counts and dtypes: shows which columns still contain
# missing values and which are numeric vs. object (string) typed.
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 90 entries, 0 to 99
Data columns (total 5 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   feature1  82 non-null     float64
 1   feature2  80 non-null     object 
 2   feature3  81 non-null     float64
 3   feature4  82 non-null     float64
 4   target    90 non-null     float64
dtypes: float64(4), object(1)
memory usage: 4.2+ KB


In [13]:
# NOTE(review): the fill values below are computed on the full dataset, before
# the train/test split performed later in the notebook, so held-out rows
# influence the imputed values. Consider fitting the imputation on the
# training split only.

# Numeric columns: replace missing entries with the column mean.
for column in df.select_dtypes(include=['int', 'float']):
    df[column] = df[column].fillna(df[column].mean())

# Object (categorical) columns: replace missing entries with the most frequent
# value of that same column. The fill is assigned back to the single column;
# calling fillna on the whole frame would write this category into every
# remaining NaN of every column.
# NOTE(review): label variants such as 'Red' and 'red' are only unified in a
# later cell, so they are counted as separate categories here.
for column in df.select_dtypes(include=['object']):
    category_counts = df[column].value_counts()  # sorted most frequent first; NaN excluded
    if category_counts.empty:
        # Column holds no observed values at all, so there is nothing to fill with.
        continue
    df[column] = df[column].fillna(category_counts.index[0])


In [14]:
df.shape # (rows, columns) remaining after filtering and imputation

(90, 5)

## Addressing any misspelled words (inconsistent capitalization)

In [15]:
# List the distinct labels in feature2 to spot inconsistent spellings.
df['feature2'].unique()

array(['blue', 'green', 'Red', 'red'], dtype=object)

In [16]:
# Collapse capitalization variants of the same category (e.g. 'Red' -> 'red')
# by lower-casing the labels. The change is limited to feature2, so no other
# column can be affected, and it is not tied to one hard-coded spelling.
df = df.assign(feature2=df['feature2'].str.lower())

In [17]:
# Dummy-encode feature2 as int32 indicator columns named "feature2_<label>".
# drop_first=True leaves out the indicator of the first category level, so the
# remaining indicators are not perfectly collinear with a model intercept.
df_dummy = pd.get_dummies(
    data=df,
    columns=['feature2'],
    prefix_sep='_',
    drop_first=True,
    dummy_na=False,
    dtype='int32',
)

df_dummy.head(n=10)

Unnamed: 0,feature1,feature3,feature4,target,feature2_green,feature2_red
0,355.19,-25.301938,0.0,-143.065368,0,0
1,308.12,-16.278,8.0,-4.461594,1,0
2,500.51,-25.301938,8.0,-0.835365,1,0
3,451.46,-8.159,4.304878,-34.189506,1,0
4,502.45,18.65,4.304878,26.118365,1,0
6,238.03,-98.232,5.0,97.595136,0,0
7,488.55,-87.694,-12.0,72.342689,0,1
8,439.83,-25.301938,-1.0,123.887711,0,1
9,591.54,5.088,4.304878,-13.667815,1,0
10,501.811341,-26.537,1.0,-130.716327,0,0


In [18]:
# Full one-hot encoding of feature2: drop_first=False keeps one int32
# indicator column per category level, named "feature2_<label>".
df_onehot = pd.get_dummies(
    data=df,
    columns=['feature2'],
    prefix_sep='_',
    drop_first=False,
    dummy_na=False,
    dtype='int32',
)

df_onehot.head(n=10)

Unnamed: 0,feature1,feature3,feature4,target,feature2_blue,feature2_green,feature2_red
0,355.19,-25.301938,0.0,-143.065368,1,0,0
1,308.12,-16.278,8.0,-4.461594,0,1,0
2,500.51,-25.301938,8.0,-0.835365,0,1,0
3,451.46,-8.159,4.304878,-34.189506,0,1,0
4,502.45,18.65,4.304878,26.118365,0,1,0
6,238.03,-98.232,5.0,97.595136,1,0,0
7,488.55,-87.694,-12.0,72.342689,0,0,1
8,439.83,-25.301938,-1.0,123.887711,0,0,1
9,591.54,5.088,4.304878,-13.667815,0,1,0
10,501.811341,-26.537,1.0,-130.716327,1,0,0


In [19]:
# Separate the response from the predictors, using the dummy-encoded frame.
target = df_dummy['target']
features = df_dummy.drop(columns=['target'])

# Hold out 20% of the rows for evaluation; the fixed random_state makes the
# split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    features,
    target,
    test_size=0.2,
    random_state=42,
)

In [20]:
# Fit an ordinary least-squares regression on the training split.
model = LinearRegression()
model.fit(X_train, y_train)

# Print the model's score (R^2 for LinearRegression) on the training split
# first, then on the held-out test split.
train_score = model.score(X_train, y_train)
test_score = model.score(X_test, y_test)
print(train_score)
print(test_score)


0.8113862100168268
0.7231514687374483
