In [None]:
# Libraries to be used in the model
import pandas as pd
import numpy as py

# Ploting libraries
import matplotlib.pyplot as plt

# Model Libraries
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score

# Metrics or statistics to use
from statistics import mean
from sklearn import metrics
from sklearn.metrics import mean_absolute_error, accuracy_score, roc_auc_score, mean_squared_error,confusion_matrix,classification_report

######################################3

# Get unique values per column
def print_unique_values_per_column(d):
    for c in d.columns:
        if (d[c].nunique() < 30):
            print(c, d[c].nunique(), d[c].dtype, d[c].unique())
        else:
            print(c, d[c].nunique(), d[c].dtype)

# Print missing values
def print_missing_values(d):
    missing_total = d.isnull().sum().sort_values(ascending=False)
    missing_percent = (d.isnull().sum()/d.isnull().count()).sort_values(ascending=False)
    missing = pd.concat([missing_total, missing_percent], axis=1, keys=['Total', 'Percent'])
    missing = missing[missing['Percent'] > 0]
    print(missing)

def get_columns_with_nulls(d):
    missing_total = d.isnull().sum().sort_values(ascending=False)
    missing_percent = (d.isnull().sum()/d.isnull().count()).sort_values(ascending=False)
    missing = pd.concat([missing_total, missing_percent], axis=1, keys=['Total', 'Percent'])
    missing = missing[missing['Percent'] > 0]
    return missing.index.values

# Custom accuracy evaluation
def accuracy_score(estimator, X, y):
    estimator.fit(X, y)
    y_pred = estimator.predict(X)
    accuracy = mean(1 - mean_absolute_error(y, y_pred)/y_pred)
    return accuracy

In [12]:
# Reading the data
test = pd.read_csv('test.csv')
train = pd.read_csv('train_data.csv')
train.head()

Unnamed: 0,ofd_date,country_code,fc_codes,station_code,OFD,Slam,Earlies_Exp,Earlies_Rec,MNR_SNR_Exp,Rollover,Returns,R_Sideline,Sideline
0,2021-06-30,C,"F6, F8, F14, F17",D33,14594,14568,782,896,615,767,35,2,4
1,2021-06-30,C,"F6, F8, F9, F14, F17, F18",D37,12736,13111,655,823,211,29,17,2,1
2,2021-06-30,C,"F1, F4, F6, F7, F13, F15, F16",D34,14562,15651,1028,1910,225,35,47,3,1
3,2021-06-30,C,"F2, F6, F7, F10, F12, F13, F14, F15, F19",D45,11165,11467,514,769,56,39,29,0,1
4,2021-06-30,C,"F6, F8, F13, F14, F17",D50,10006,10423,399,842,52,60,65,1,1


In [13]:
test.head()

Unnamed: 0.1,Unnamed: 0,ofd_date,country_code,fc_codes,station_code,OFD,Slam,Earlies_Rec,Rollover,Returns,R_Sideline,Sideline
0,0,2021-07-31,C,"F6, F8, F14, F17",D33,10231,10957,347,43,42,2,629
1,1,2021-07-31,C,"F6, F8, F9, F14, F17, F18",D37,10113,10261,285,50,25,0,165
2,2,2021-07-31,C,"F1, F4, F6, F7, F13, F15, F16",D34,11633,11789,300,24,30,0,202
3,3,2021-07-31,C,"F2, F6, F7, F10, F12, F13, F14, F15, F19",D45,8633,8482,180,713,24,1,276
4,4,2021-07-31,C,"F6, F8, F13, F14, F17",D50,6972,7222,116,24,16,0,130


In [16]:
# DATA TRANSFORMATION

## Create the target column
train["Target"] = train['Earlies_Exp'] - train['MNR_SNR_Exp']
train.head()

Unnamed: 0,ofd_date,country_code,fc_codes,station_code,OFD,Slam,Earlies_Exp,Earlies_Rec,MNR_SNR_Exp,Rollover,Returns,R_Sideline,Sideline,Target
0,2021-06-30,C,"F6, F8, F14, F17",D33,14594,14568,782,896,615,767,35,2,4,167
1,2021-06-30,C,"F6, F8, F9, F14, F17, F18",D37,12736,13111,655,823,211,29,17,2,1,444
2,2021-06-30,C,"F1, F4, F6, F7, F13, F15, F16",D34,14562,15651,1028,1910,225,35,47,3,1,803
3,2021-06-30,C,"F2, F6, F7, F10, F12, F13, F14, F15, F19",D45,11165,11467,514,769,56,39,29,0,1,458
4,2021-06-30,C,"F6, F8, F13, F14, F17",D50,10006,10423,399,842,52,60,65,1,1,347


In [17]:
## Create the column of days of the week with the following codes
"""
0 = Monday
1 = Tuesday
2 = Wednesday
3 = Thrusday
4 = Friday
5 = Saturday
6 = Sunday
""" 
train["ofd_date"] = pd.to_datetime(train["ofd_date"])
train["dayOdWeek"] = train['ofd_date'].dt.day_of_week
#train["dayOfWeekName"] = train['ofd_date'].dt.day_name()
train.head()

Unnamed: 0,ofd_date,country_code,fc_codes,station_code,OFD,Slam,Earlies_Exp,Earlies_Rec,MNR_SNR_Exp,Rollover,Returns,R_Sideline,Sideline,Target,dayOdWeek
0,2021-06-30,C,"F6, F8, F14, F17",D33,14594,14568,782,896,615,767,35,2,4,167,2
1,2021-06-30,C,"F6, F8, F9, F14, F17, F18",D37,12736,13111,655,823,211,29,17,2,1,444,2
2,2021-06-30,C,"F1, F4, F6, F7, F13, F15, F16",D34,14562,15651,1028,1910,225,35,47,3,1,803,2
3,2021-06-30,C,"F2, F6, F7, F10, F12, F13, F14, F15, F19",D45,11165,11467,514,769,56,39,29,0,1,458,2
4,2021-06-30,C,"F6, F8, F13, F14, F17",D50,10006,10423,399,842,52,60,65,1,1,347,2


In [18]:
## Get the country data to be separated
train = pd.get_dummies(train, prefix = ['country_'], columns = ['country_code'])
train.head()

Unnamed: 0,ofd_date,fc_codes,station_code,OFD,Slam,Earlies_Exp,Earlies_Rec,MNR_SNR_Exp,Rollover,Returns,R_Sideline,Sideline,Target,dayOdWeek,country__A,country__B,country__C,country__D
0,2021-06-30,"F6, F8, F14, F17",D33,14594,14568,782,896,615,767,35,2,4,167,2,0,0,1,0
1,2021-06-30,"F6, F8, F9, F14, F17, F18",D37,12736,13111,655,823,211,29,17,2,1,444,2,0,0,1,0
2,2021-06-30,"F1, F4, F6, F7, F13, F15, F16",D34,14562,15651,1028,1910,225,35,47,3,1,803,2,0,0,1,0
3,2021-06-30,"F2, F6, F7, F10, F12, F13, F14, F15, F19",D45,11165,11467,514,769,56,39,29,0,1,458,2,0,0,1,0
4,2021-06-30,"F6, F8, F13, F14, F17",D50,10006,10423,399,842,52,60,65,1,1,347,2,0,0,1,0


In [14]:
## Standarized the numerical data (if needed)
for column in ['OFD','Slam','Earlies_Exp','Earlies_Rec','MNR_SNR_Exp','Rollover','Returns','R_Sideline','Sideline']:
    train[column] = (train[column] - train[column].mean()) / train[column].std()
train.head()

Unnamed: 0,ofd_date,fc_codes,station_code,OFD,Slam,Earlies_Exp,Earlies_Rec,MNR_SNR_Exp,Rollover,Returns,R_Sideline,Sideline,Target,dayOdWeek,country__A,country__B,country__C,country__D
0,2021-06-30,"F6, F8, F14, F17",D33,-0.204823,-0.204666,0.227256,0.39833,0.839835,0.887818,-0.419134,-0.360889,-0.353328,167,2,0,0,1,0
1,2021-06-30,"F6, F8, F9, F14, F17, F18",D37,-0.456217,-0.405835,0.01717,0.278485,-0.010056,-0.336296,-0.478437,-0.360889,-0.359509,444,2,0,0,1,0
2,2021-06-30,"F1, F4, F6, F7, F13, F15, F16",D34,-0.209153,-0.055136,0.634195,2.063031,0.019396,-0.326344,-0.379598,-0.358879,-0.359509,803,2,0,0,1,0
3,2021-06-30,"F2, F6, F7, F10, F12, F13, F14, F15, F19",D45,-0.668779,-0.632822,-0.216076,0.189832,-0.336128,-0.319709,-0.438901,-0.364911,-0.359509,458,2,0,0,1,0
4,2021-06-30,"F6, F8, F13, F14, F17",D50,-0.825596,-0.776968,-0.406312,0.309677,-0.344543,-0.284877,-0.320294,-0.3629,-0.359509,347,2,0,0,1,0


In [20]:
## Try without fc_codes and station_code
train = train.drop(columns=['ofd_date','fc_codes', 'station_code','Earlies_Exp','MNR_SNR_Exp'])

In [None]:
### fc_code cleaning
# train.fc_codes = train.fc_codes.apply(lambda x: x.strip())
# train = pd.concat([train, train['fc_codes'].str.get_dummies(sep=',')], axis=1)

In [None]:
# station_code cleaning
# l= len(train['station_code'].unique())
# train.station_code = train.station_code.apply(lambda x: x.strip())
# station_stats = train.groupby('station_code')['station_code'].agg('count')
# station_stats

In [9]:
l= len(train['fc_codes'].unique())
l

60

In [28]:
#Check the ammount of null data
train.isna().sum()

OFD            0
Slam           0
Earlies_Exp    0
Earlies_Rec    0
MNR_SNR_Exp    0
Rollover       0
Returns        0
R_Sideline     0
Sideline       0
Target         0
dayOdWeek      0
country__A     0
country__B     0
country__C     0
country__D     0
dtype: int64

In [15]:
#Check the correlation of the variables with the target column
corr = train.corr()
corr["Target"].sort_values(ascending=False)

Target         1.000000
Earlies_Exp    0.755037
Earlies_Rec    0.400709
OFD            0.290058
Sideline       0.265911
Slam           0.245184
dayOdWeek      0.114565
country__A     0.098102
country__B     0.096786
country__C     0.040469
Returns        0.036127
R_Sideline     0.026542
Rollover      -0.024841
country__D    -0.231282
MNR_SNR_Exp   -0.552011
Name: Target, dtype: float64

In [16]:
#Preparing the variables for the model.
y = train['Target']
X = train.drop(['Target'], axis=1)

In [17]:
Y

1.0

NameError: name 'x' is not defined