In [136]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import MinMaxScaler
from imblearn.over_sampling import RandomOverSampler
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.decomposition import PCA
import numpy as np

pd.set_option("display.max_columns", None)

df = pd.read_csv("./grain-training.csv")

In [137]:
df

Unnamed: 0,Area,Perimeter,Major_Axis_Length,Minor_Axis_Length,Eccentricity,Convex_Area,Extent,Class
0,11366,423.114990,171.906647,85.579300,0.867278,11599,0.611404,Osmancik
1,16523,531.892029,224.995422,94.417702,0.907689,16911,0.577041,Cammeo
2,11088,418.208008,172.027420,82.935669,0.876112,11284,0.624993,Osmancik
3,14528,475.447998,192.198563,97.417427,0.862029,14795,0.629490,Cammeo
4,8990,389.377014,157.749603,73.919182,0.883418,9297,0.625261,Osmancik
...,...,...,...,...,...,...,...,...
3043,14078,478.470001,203.645462,88.560310,0.900491,14280,0.744395,Cammeo
3044,17246,540.541992,225.988861,98.573151,0.899857,17704,0.573929,Cammeo
3045,11070,419.403015,173.575043,82.154213,0.880898,11266,0.600586,Osmancik
3046,11747,452.127014,194.494858,78.744461,0.914376,11935,0.542637,Cammeo


In [138]:
def preprocess(df):
    # Remove duplicates
    df = df.drop_duplicates()

    # Handle missing values - Using median for numerical features
    imputer = SimpleImputer(strategy="median")
    df_numeric = df.select_dtypes(include=[np.number])
    df[df_numeric.columns] = imputer.fit_transform(df_numeric)

    # Creating interaction terms - Example with MajorAxisLength and MinorAxisLength
    df["Area_Product"] = df["Major_Axis_Length"] * df["Minor_Axis_Length"]


    return df

df = preprocess(df)

In [139]:
df

Unnamed: 0,Area,Perimeter,Major_Axis_Length,Minor_Axis_Length,Eccentricity,Convex_Area,Extent,Class,Area_Product
0,-0.745353,-0.869621,-0.963718,-0.118621,-0.939649,-0.755098,-0.668661,Osmancik,-0.723942
1,2.247170,2.202095,2.094802,1.422411,1.000794,2.250292,-1.113363,Cammeo,2.223597
2,-0.906672,-1.008188,-0.956760,-0.579555,-0.515454,-0.933317,-0.492800,Osmancik,-0.924497
3,1.089504,0.608192,0.205328,1.945432,-1.191667,1.053115,-0.434602,Cammeo,1.086412
4,-2.124108,-1.822336,-1.779326,-2.151638,-0.164641,-2.057509,-0.489333,Osmancik,-2.100682
...,...,...,...,...,...,...,...,...,...
3043,0.828376,0.693529,0.864800,0.401137,0.655144,0.761741,1.052433,Cammeo,0.775687
3044,2.666716,2.446358,2.152035,2.146940,0.624686,2.698950,-1.153637,Cammeo,2.689688
3045,-0.917117,-0.974442,-0.867600,-0.715807,-0.285663,-0.943500,-0.808662,Osmancik,-0.927786
3046,-0.524265,-0.050361,0.337621,-1.310319,1.321858,-0.564998,-1.558607,Cammeo,-0.451501


In [140]:
X = df.drop("Class", axis=1)
y = df[["Class"]]

ro = RandomOverSampler()
X_new, y_new = ro.fit_resample(X, y)


X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=.30, random_state=10
)

model = LogisticRegression(random_state=10)


model.fit(X_train, y_train.values.ravel())

predictions = model.predict(X_test)


score = accuracy_score(y_test, predictions)

score

0.9224043715846995

In [141]:
holdout = pd.read_csv("./grain-holdout.csv")

holdout = preprocess(holdout)
predictions = model.predict(holdout)

ValueError: The feature names should match those that were passed during fit.
Feature names seen at fit time, yet now missing:
- Area_Product


In [None]:
df = pd.DataFrame(predictions)
df.to_csv("nathan_lunceford-ice-grain-predictions.csv", index=False, header=["Class"])