In [142]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import MinMaxScaler
from imblearn.over_sampling import RandomOverSampler
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.decomposition import PCA
import numpy as np

pd.set_option("display.max_columns", None)

df = pd.read_csv("./grain-training.csv")

In [143]:
df

Unnamed: 0,Area,Perimeter,Major_Axis_Length,Minor_Axis_Length,Eccentricity,Convex_Area,Extent,Class
0,11366,423.114990,171.906647,85.579300,0.867278,11599,0.611404,Osmancik
1,16523,531.892029,224.995422,94.417702,0.907689,16911,0.577041,Cammeo
2,11088,418.208008,172.027420,82.935669,0.876112,11284,0.624993,Osmancik
3,14528,475.447998,192.198563,97.417427,0.862029,14795,0.629490,Cammeo
4,8990,389.377014,157.749603,73.919182,0.883418,9297,0.625261,Osmancik
...,...,...,...,...,...,...,...,...
3043,14078,478.470001,203.645462,88.560310,0.900491,14280,0.744395,Cammeo
3044,17246,540.541992,225.988861,98.573151,0.899857,17704,0.573929,Cammeo
3045,11070,419.403015,173.575043,82.154213,0.880898,11266,0.600586,Osmancik
3046,11747,452.127014,194.494858,78.744461,0.914376,11935,0.542637,Cammeo


In [144]:
def preprocess(df):
    # Remove duplicates
    df = df.drop_duplicates()

    
    imputer = SimpleImputer(strategy="median")
    df_numeric = df.select_dtypes(include=[np.number])
    df[df_numeric.columns] = imputer.fit_transform(df_numeric)

    df["Area_Product"] = df["Major_Axis_Length"] * df["Minor_Axis_Length"]


    return df

df = preprocess(df)

In [145]:
df

Unnamed: 0,Area,Perimeter,Major_Axis_Length,Minor_Axis_Length,Eccentricity,Convex_Area,Extent,Class,Area_Product
0,11366.0,423.114990,171.906647,85.579300,0.867278,11599.0,0.611404,Osmancik,14711.650478
1,16523.0,531.892029,224.995422,94.417702,0.907689,16911.0,0.577041,Cammeo,21243.550681
2,11088.0,418.208008,172.027420,82.935669,0.876112,11284.0,0.624993,Osmancik,14267.209155
3,14528.0,475.447998,192.198563,97.417427,0.862029,14795.0,0.629490,Cammeo,18723.489453
4,8990.0,389.377014,157.749603,73.919182,0.883418,9297.0,0.625261,Osmancik,11660.721608
...,...,...,...,...,...,...,...,...,...
3043,14078.0,478.470001,203.645462,88.560310,0.900491,14280.0,0.744395,Cammeo,18034.905318
3044,17246.0,540.541992,225.988861,98.573151,0.899857,17704.0,0.573929,Cammeo,22276.434046
3045,11070.0,419.403015,173.575043,82.154213,0.880898,11266.0,0.600586,Osmancik,14259.921021
3046,11747.0,452.127014,194.494858,78.744461,0.914376,11935.0,0.542637,Cammeo,15315.392756


In [146]:
X = df.drop("Class", axis=1)
y = df[["Class"]]

ro = RandomOverSampler()
X_new, y_new = ro.fit_resample(X, y)


X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=.30, random_state=10
)

model = LogisticRegression(random_state=10)


model.fit(X_train, y_train.values.ravel())

predictions = model.predict(X_test)


score = accuracy_score(y_test, predictions)

score

0.9245901639344263

In [147]:
holdout = pd.read_csv("./grain-holdout.csv")

holdout = preprocess(holdout)
predictions = model.predict(holdout)

In [148]:
df = pd.DataFrame(predictions)
df.to_csv("nathan_lunceford-ice-grain-predictions.csv", index=False, header=["Class"])