In [None]:
import os
import time
import warnings
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import sklearn
import sklearn.preprocessing
from sklearn.cluster import KMeans
from sklearn.linear_model import LogisticRegression, SGDClassifier, SGDRegressor
warnings.filterwarnings('ignore')

In [None]:
# The data directory can be overridden with the DATA_DIR environment variable so
# the notebook runs on other machines; the fallback keeps the original location.
DATA_DIR = Path(os.environ.get("DATA_DIR", "/Users/mohd/Desktop/data"))
train_data = pd.read_csv(DATA_DIR / "train.csv")
test_data = pd.read_csv(DATA_DIR / "test.csv")

## Functions

In [None]:
def calculateAllPercentages(df):
    """Return, per feature column, the share of non-null rows that have a detection.

    For every column other than 'MachineIdentifier' and 'HasDetections' the
    value is (non-null entries where HasDetections == 1) / (non-null entries).
    Despite the function name, the results are fractions, not percentages.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a 'HasDetections' column. 'MachineIdentifier' is skipped
        when present and is no longer required.

    Returns
    -------
    dict
        Column name -> fraction, in the frame's column order. A column with no
        non-null values maps to NaN.

    Raises
    ------
    KeyError
        If 'HasDetections' is missing from ``df``.
    """
    excluded = ('MachineIdentifier', 'HasDetections')
    # Loop-invariant: the detection mask is identical for every column.
    detected = df["HasDetections"] == 1

    tot = {}
    for col in df.columns:
        if col in excluded:
            continue
        before = df[col].count()
        after = df[col][detected].count()
        # Explicit guard instead of relying on a silenced 0/0 warning.
        tot[col] = after / before if before else float('nan')

    return tot

## Plotting

In [None]:
# Per-column detection share, computed over the full training frame.
tot = calculateAllPercentages(train_data)

In [None]:
# Feature columns: everything except the identifier and the label.
cols = train_data.columns.tolist()
for excluded_name in ('MachineIdentifier', 'HasDetections'):
    cols.remove(excluded_name)

# Position -> column name lookup, used to read the x axis of the plot.
index_to_col = dict(enumerate(cols))

# x holds the positions, y the matching value from `tot` for each column.
x = list(index_to_col)
y = [tot[name] for name in cols]
index = len(cols)

In [None]:
# Explicit figure/axes with labels so the chart can be read on its own; the
# trailing semicolon keeps the Line2D repr out of the cell output.
fig, ax = plt.subplots()
ax.plot(x, y)
ax.set(
    xlabel="Feature index (see index_to_col)",
    ylabel="Share of non-null rows with HasDetections == 1",
    title="Detection share per feature",
);

## Get numerical training and test data (features)

In [None]:
# Partition the training columns by dtype: int64/float64 columns go to cols_num,
# everything else to cols_others.
cols_num = []
cols_others = []
dtypes = train_data.dtypes
# Iterate (column, dtype) pairs by label. Integer keys such as ``dtypes[i]`` on
# this label-indexed Series are positional lookups that pandas 2.x deprecates.
for col, dtype in dtypes.items():
    if dtype in ['int64', 'float64']:
        cols_num.append(col)
    else:
        cols_others.append(col)

# Missing numeric values are replaced with 0.
train_nums = train_data[cols_num].fillna(0)
# The identifier is not a feature; drop it from the non-numeric column list.
cols_others.remove('MachineIdentifier')
train_others = train_data[cols_others]
test_others = test_data[cols_others]


In [None]:
# Rebuild the list rather than calling ``cols_num.remove(...)`` so that re-running
# this cell is harmless (a second ``remove`` would raise ValueError).
cols_num = [col for col in cols_num if col != 'HasDetections']
# Numeric test features, with missing values replaced by 0 as for the training data.
test_nums = test_data[cols_num].fillna(0)

In [None]:
# Label column first, then the feature matrix restricted to the columns in cols_num.
results = train_nums.loc[:, 'HasDetections']
features = train_nums.loc[:, cols_num]

In [None]:
# Preview the first rows of the zero-filled numeric training columns.
train_nums.head()

## Loading data

In [None]:
# n is the number of training rows; every slice below keeps at most n rows.
n = features.shape[0]
x_train, y_train = features.iloc[:n], results.iloc[:n]

# NOTE(review): the test features are also capped at n (the training row count).
# Confirm this is intended if the test file can be longer than the training file.
x_test = test_nums.iloc[:n]

## Analyze non-number features

In [None]:
# Preview the first rows of the non-numeric training columns.
train_others.head()

In [None]:
# Build a {value: integer code} lookup for every non-numeric column.
# Codes follow order of first appearance (Series.unique) instead of set iteration
# order, which for strings changes between interpreter runs and made the
# encoding irreproducible.
t = {
    col: {val: code for code, val in enumerate(train_others[col].unique())}
    for col in cols_others
}


## Logistic regression (scaling data)

In [None]:
# Standardise with the mean/std learned from the training data only, then apply
# the same transform to the test data. Scaling each set with its own statistics
# would feed the model test features on a different scale than it was trained on.
scaler1 = sklearn.preprocessing.StandardScaler().fit(x_train)
x_train1 = scaler1.transform(x_train)
x_test1 = scaler1.transform(x_test)

In [None]:
# Fit logistic regression (lbfgs solver, fixed seed) on the scaled training features.
# NOTE(review): `multi_class` is reportedly deprecated in recent scikit-learn
# releases — confirm against the installed version before upgrading.
log_reg1 = LogisticRegression(random_state=0, solver='lbfgs', multi_class="multinomial").fit(x_train1, y_train)


In [None]:
# Predictions of the first logistic-regression model for the scaled test features.
predict1 = log_reg1.predict(x_test1)


In [None]:
# Submission frame: the first n machine identifiers paired with the predictions.
d1 = dict(
    MachineIdentifier=test_data['MachineIdentifier'][:n],
    HasDetections=predict1,
)
output1 = pd.DataFrame(d1)


In [None]:
# Write the predictions to output1.csv; index=False leaves out the row index.
output1.to_csv("output1.csv", index=False)

In [None]:
# Accuracy on the training data itself (not a held-out score).
predict11 = log_reg1.predict(x_train1)

# Vectorised mean of the boolean matches; the builtin sum() walked the Series
# element by element in Python. The value is the same matches / row-count ratio.
print("Score: {:.3f}".format((predict11 == y_train).mean()))

## Logistic regression (normalizing data)

In [None]:
# "Normalizing" variant: rescale every sample (row) to unit L2 norm.
# This cell previously repeated the standardisation used by the scaling
# experiment, so the two experiments trained identical models.
# normalize() is stateless per row, so train and test can be transformed separately.
x_train2 = sklearn.preprocessing.normalize(x_train)
x_test2 = sklearn.preprocessing.normalize(x_test)

In [None]:
# Fit logistic regression (lbfgs solver, fixed seed) on the x_train2 features.
log_reg2 = LogisticRegression(random_state=0, solver='lbfgs', multi_class="multinomial").fit(x_train2, y_train)


In [None]:
# Predictions of the second logistic-regression model for the x_test2 features.
predict2 = log_reg2.predict(x_test2)


In [None]:
# Submission frame: the first n machine identifiers paired with the predictions.
d2 = dict(
    MachineIdentifier=test_data['MachineIdentifier'][:n],
    HasDetections=predict2,
)
output2 = pd.DataFrame(d2)


In [None]:
# Write the predictions to output2.csv; index=False leaves out the row index.
output2.to_csv("output2.csv", index=False)

In [None]:
# Accuracy on the training data itself (not a held-out score).
predict22 = log_reg2.predict(x_train2)

# Vectorised mean of the boolean matches; the builtin sum() walked the Series
# element by element in Python. The value is the same matches / row-count ratio.
print("Score: {:.3f}".format((predict22 == y_train).mean()))

## Logistic regression (data not changed)

In [None]:
# Fit logistic regression (lbfgs solver, fixed seed) on the untransformed training features.
log_reg3 = LogisticRegression(random_state=0, solver='lbfgs', multi_class="multinomial").fit(x_train, y_train)


In [None]:
# Predictions of the third logistic-regression model for the untransformed test features.
predict3 = log_reg3.predict(x_test)


In [None]:
# Submission frame: the first n machine identifiers paired with the predictions.
d3 = dict(
    MachineIdentifier=test_data['MachineIdentifier'][:n],
    HasDetections=predict3,
)
output3 = pd.DataFrame(d3)


In [None]:
# Write the predictions to output3.csv; index=False leaves out the row index.
output3.to_csv("output3.csv", index=False)

In [None]:
# Accuracy on the training data itself (not a held-out score).
predict33 = log_reg3.predict(x_train)

# Vectorised mean of the boolean matches; the builtin sum() walked the Series
# element by element in Python. The value is the same matches / row-count ratio.
print("Score: {:.3f}".format((predict33 == y_train).mean()))

## SGD

In [None]:
# Linear classifier trained with SGD (hinge loss, L2 penalty, max_iter=5).
# random_state fixes the seed so repeated runs produce the same model, matching
# the seed used by the logistic-regression experiments.
sgd1 = SGDClassifier(loss="hinge", penalty="l2", max_iter=5, random_state=0).fit(x_train, y_train)


In [None]:
# Predictions of the SGD classifier for the untransformed test features.
predict4 = sgd1.predict(x_test)

In [None]:
# Submission frame: the first n machine identifiers paired with the predictions.
d4 = dict(
    MachineIdentifier=test_data['MachineIdentifier'][:n],
    HasDetections=predict4,
)
output4 = pd.DataFrame(d4)


In [None]:
# Write the predictions to output4.csv; index=False leaves out the row index.
output4.to_csv("output4.csv", index=False)

In [None]:
# Accuracy on the training data itself (not a held-out score).
predict44 = sgd1.predict(x_train)
# Vectorised mean of the boolean matches; the builtin sum() walked the Series
# element by element in Python. The value is the same matches / row-count ratio.
print("Score: {:.3f}".format((predict44 == y_train).mean()))


## SGD + LR

In [None]:
# Linear regressor trained with SGD on the untransformed features.
# random_state fixes the seed so repeated runs produce the same model, matching
# the seed used by the logistic-regression experiments.
sgd2 = SGDRegressor(random_state=0).fit(x_train, y_train)

In [None]:
# Regression outputs for the test features; these are continuous values, not class labels.
predict5 = sgd2.predict(x_test)

In [None]:
# Submission frame: the first n machine identifiers paired with the regression outputs.
d5 = dict(
    MachineIdentifier=test_data['MachineIdentifier'][:n],
    HasDetections=predict5,
)
output5 = pd.DataFrame(d5)


In [None]:
# Write the regression outputs to output5.csv; index=False leaves out the row index.
output5.to_csv("output5.csv", index=False)

In [None]:
predict55 = sgd2.predict(x_train)
# The regressor returns continuous values, so exact equality with the labels
# almost never holds and the old score was effectively always 0. Threshold at 0.5
# to get class labels first (assumes HasDetections is encoded as 0/1).
# This is accuracy on the training data itself, not a held-out score.
print("Score: {:.3f}".format(((predict55 >= 0.5).astype(int) == y_train).mean()))

In [None]:
# Display the training-set regression outputs rounded to the nearest integer.
np.round(predict55)