In [1]:
# Standard library
from statistics import mean

# Libraries to be used in the model
import pandas as pd
import numpy as py  # NOTE: unconventional alias (usually `np`); kept in case other cells reference `py`

# Plotting libraries
import matplotlib.pyplot as plt

# Model Libraries
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.svm import SVR

# Metrics to use
from sklearn import metrics
from sklearn.metrics import mean_absolute_error, accuracy_score, roc_auc_score, mean_squared_error,confusion_matrix,classification_report

######################################

# Get unique values per column
def print_unique_values_per_column(d):
    for c in d.columns:
        if (d[c].nunique() < 30):
            print(c, d[c].nunique(), d[c].dtype, d[c].unique())
        else:
            print(c, d[c].nunique(), d[c].dtype)

# Print missing values
def print_missing_values(d):
    missing_total = d.isnull().sum().sort_values(ascending=False)
    missing_percent = (d.isnull().sum()/d.isnull().count()).sort_values(ascending=False)
    missing = pd.concat([missing_total, missing_percent], axis=1, keys=['Total', 'Percent'])
    missing = missing[missing['Percent'] > 0]
    print(missing)

def get_columns_with_nulls(d):
    missing_total = d.isnull().sum().sort_values(ascending=False)
    missing_percent = (d.isnull().sum()/d.isnull().count()).sort_values(ascending=False)
    missing = pd.concat([missing_total, missing_percent], axis=1, keys=['Total', 'Percent'])
    missing = missing[missing['Percent'] > 0]
    return missing.index.values

# Custom accuracy evaluation
def accuracy_score(estimator, X, y):
    estimator.fit(X, y)
    y_pred = estimator.predict(X)
    accuracy = mean(1 - mean_absolute_error(y, y_pred)/y_pred)
    return accuracy

In [2]:
# Reading the data: load both CSV files (paths are relative to the working directory)
test = pd.read_csv('test.csv')
train = pd.read_csv('train_data.csv')
# Preview the first rows of the training frame
train.head()

Unnamed: 0,ofd_date,country_code,fc_codes,station_code,OFD,Slam,Earlies_Exp,Earlies_Rec,MNR_SNR_Exp,Rollover,Returns,R_Sideline,Sideline
0,2021-06-30,C,"F6, F8, F14, F17",D33,14594,14568,782,896,615,767,35,2,4
1,2021-06-30,C,"F6, F8, F9, F14, F17, F18",D37,12736,13111,655,823,211,29,17,2,1
2,2021-06-30,C,"F1, F4, F6, F7, F13, F15, F16",D34,14562,15651,1028,1910,225,35,47,3,1
3,2021-06-30,C,"F2, F6, F7, F10, F12, F13, F14, F15, F19",D45,11165,11467,514,769,56,39,29,0,1
4,2021-06-30,C,"F6, F8, F13, F14, F17",D50,10006,10423,399,842,52,60,65,1,1


In [3]:
# Count the missing values in each column of the training frame
train.isnull().sum()

ofd_date        0
country_code    0
fc_codes        0
station_code    0
OFD             0
Slam            0
Earlies_Exp     0
Earlies_Rec     0
MNR_SNR_Exp     0
Rollover        0
Returns         0
R_Sideline      0
Sideline        0
dtype: int64

In [4]:
# DATA TRANSFORMATION

## 1) Create the target column from the raw (not yet standardized) values
train["Target"] = train['Earlies_Exp'] - train['MNR_SNR_Exp']

## 2) Standardize the numerical columns to z-scores (column mean 0, sample std 1).
##    Target is not in this list, so it keeps its original scale.
for column in ('OFD', 'Slam', 'Earlies_Exp', 'Earlies_Rec', 'MNR_SNR_Exp',
               'Rollover', 'Returns', 'R_Sideline', 'Sideline'):
    train[column] = train[column].sub(train[column].mean()).div(train[column].std())

## 3) Create the day-of-week column with the following integer codes:
##    0 = Monday, 1 = Tuesday, 2 = Wednesday, 3 = Thursday,
##    4 = Friday, 5 = Saturday, 6 = Sunday
train["ofd_date"] = pd.to_datetime(train["ofd_date"])
# NOTE(review): "dayOdWeek" looks like a typo for "dayOfWeek"; the name is kept
# because renaming the column would change the frame's schema.
train["dayOdWeek"] = train["ofd_date"].dt.day_of_week

## 4) Create one indicator column per FC code found in the ', '-separated fc_codes field
train = pd.concat(
    (train, train['fc_codes'].str.get_dummies(sep=', ')),
    axis='columns',
)

## 5) One-hot encode the categorical columns; the dummy columns are appended
##    after the remaining columns, country first and station second
train = pd.get_dummies(
    train,
    columns=['country_code', 'station_code'],
    prefix=['country_', 'station_'],
)

## 6) Drop the source columns that are no longer needed
train = train.drop(['ofd_date', 'fc_codes'], axis=1)

train.head()


Unnamed: 0,OFD,Slam,Earlies_Exp,Earlies_Rec,MNR_SNR_Exp,Rollover,Returns,R_Sideline,Sideline,Target,...,station__D73,station__D74,station__D75,station__D76,station__D77,station__D78,station__D79,station__D8,station__D80,station__D9
0,-0.204823,-0.204666,0.227256,0.39833,0.839835,0.887818,-0.419134,-0.360889,-0.353328,167,...,0,0,0,0,0,0,0,0,0,0
1,-0.456217,-0.405835,0.01717,0.278485,-0.010056,-0.336296,-0.478437,-0.360889,-0.359509,444,...,0,0,0,0,0,0,0,0,0,0
2,-0.209153,-0.055136,0.634195,2.063031,0.019396,-0.326344,-0.379598,-0.358879,-0.359509,803,...,0,0,0,0,0,0,0,0,0,0
3,-0.668779,-0.632822,-0.216076,0.189832,-0.336128,-0.319709,-0.438901,-0.364911,-0.359509,458,...,0,0,0,0,0,0,0,0,0,0
4,-0.825596,-0.776968,-0.406312,0.309677,-0.344543,-0.284877,-0.320294,-0.3629,-0.359509,347,...,0,0,0,0,0,0,0,0,0,0


In [5]:
# Rank every column by its correlation with the target, strongest positive first
corr = train.corr(method='pearson')
corr.loc[:, "Target"].sort_values(ascending=False)

Target         1.000000
Earlies_Exp    0.755037
Earlies_Rec    0.400709
OFD            0.290058
Sideline       0.265911
                 ...   
F42           -0.184277
F43           -0.194223
F48           -0.199170
country__D    -0.231282
MNR_SNR_Exp   -0.552011
Name: Target, Length: 150, dtype: float64

In [6]:
# Prepare the model inputs: X holds every column except the target, Y is the target
X = train.drop(columns=['Target'])
Y = train['Target']

In [7]:
# Display the target Series
Y

0        167
1        444
2        803
3        458
4        347
        ... 
11304    425
11305    235
11306     99
11307    165
11308    249
Name: Target, Length: 11309, dtype: int64

In [None]:
# Split into train / evaluation parts. The fixed seed makes the split, and
# therefore the reported metrics, reproducible across kernel restarts.
# NOTE(review): test_size=0.60 keeps only 40% of the rows for training - confirm.
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size = 0.60, random_state=42)

# Target is a numeric quantity (the difference of two count columns), so this is
# a regression problem. A classifier such as SVC treats every distinct target
# value as its own class, and the confusion matrix / classification report are
# not meaningful for such a target. Use the support-vector regressor with the
# same linear kernel instead.
# NOTE(review): X still contains Earlies_Exp and MNR_SNR_Exp, the two columns
# Target is computed from - confirm that this is intended.
svregressor = SVR(kernel='linear')
svregressor.fit(X_train, y_train)
svclassifier = svregressor  # previous variable name, kept for any later cell that refers to it
y_pred = svregressor.predict(X_test)

# Regression metrics on the held-out part
print('MAE :', mean_absolute_error(y_test, y_pred))
print('RMSE:', mean_squared_error(y_test, y_pred) ** 0.5)
print('R^2 :', metrics.r2_score(y_test, y_pred))