In [43]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [44]:
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA
from sklearn.neighbors import LocalOutlierFactor
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestClassifier
from scipy import stats

class Model:

    def preprocess(self, train, test):
        # fill missing values with mean
        train = train.fillna(train.mean())
        test = test.fillna(test.mean())
        # remove columns with all zeros and store the column names
        zero_cols = train.columns[(train == 0).all()]
        train = train.loc[:, (train != 0).any(axis=0)]
        # remove zero_cols from test
        test = test.drop(zero_cols, axis=1)

        return pd.DataFrame(train), pd.DataFrame(test)

    def pca(self, train, test, components=661):

        train = train.to_numpy()
        test = test.to_numpy()

        # perform pca on train
        pca = PCA(n_components=components)
        pca.fit(train)
        return pd.DataFrame(pca.transform(train)), pd.DataFrame(pca.transform(test))

    def lda(self, train, test, target, components=19):

        train = train.to_numpy()
        test = test.to_numpy()
        target = target.to_numpy()
        # perform lda on train
        # use label encoder to encode target
        # target = self.label_encoder.fit_transform(target)
        lda = LDA(n_components=components)
        lda.fit(train, target)
        return pd.DataFrame(lda.transform(train)), pd.DataFrame(lda.transform(test))

    def outLier(self, data, target):
        # Use lof to remove outliers
        data = data.to_numpy()
        lof = LocalOutlierFactor(n_neighbors=20, contamination=0.1)
        y_pred = lof.fit_predict(data)
        data = data[y_pred == 1]
        # remove same rows from target which are removed from data
        target = target[y_pred == 1]
        return pd.DataFrame(data), pd.DataFrame(target)

    def cluster_addLabels(self, train, test):
        # Use kmeans to cluster data

        train = train.to_numpy()
        test = test.to_numpy()


        kmeans = KMeans(n_clusters=2, random_state=0, max_iter=100000)
        kmeans.fit(train)

        # add labels to train and test
        train = pd.DataFrame(train)
        train[5001] = kmeans.labels_

        test = pd.DataFrame(test)
        test[5001] = kmeans.predict(test)
        
        # _________________________________________________________

        train = train.to_numpy()
        test = test.to_numpy()

        kmeans = KMeans(n_clusters=8, random_state=0, max_iter=100000)
        kmeans.fit(train)

        train = pd.DataFrame(train)
        train[5002] = kmeans.labels_

        test = pd.DataFrame(test)
        test[5002] = kmeans.predict(test)

        # # _________________________________________________________

        train = train.to_numpy()
        test = test.to_numpy()

        kmeans = KMeans(n_clusters=18, random_state=0, max_iter=100000)
        kmeans.fit(train)

        train = pd.DataFrame(train)
        train[5003] = kmeans.labels_

        test = pd.DataFrame(test)
        test[5003] = kmeans.predict(test)

        return pd.DataFrame(train), pd.DataFrame(test)
    
    def fit_predict(self, train, target, test):
        # use logistic regression to fit and predict use one vs one method

        train = train.to_numpy()
        target = target.to_numpy()
        test = test.to_numpy()

        # use random forest to fit and predict
        rf = RandomForestClassifier(n_estimators=10000, max_depth=10, random_state=0, criterion='entropy')
        rf.fit(train, target)
        pred_rf = rf.predict(test)

        # use logistic regression to fit and predict
        lr = LogisticRegression(random_state=0, max_iter=int(1e8), tol=1e-8, solver='saga', multi_class='multinomial')
        lr.fit(train, target)
        pred_lr = lr.predict(test)
        pred_prob = lr.predict_proba(test)

        pred = []
        prob = []

        # select best 2 probabilities 
        for i in range(len(test)):
            if sorted(pred_prob[i])[-1] - sorted(pred_prob[i])[-2] < 0.2:
                pred.append(pred_rf[i])        
                prob.append([i, sorted(pred_prob[i])[-1], sorted(pred_prob[i])[-2]])
            else:
                pred.append(pred_lr[i])
        

        return pd.DataFrame(pred), pd.DataFrame(prob)



In [45]:


# import silhoutte score
from sklearn.metrics import silhouette_score

# read test.csv
train = pd.read_csv('train.csv')
target = train['category']
train.drop(columns=['ID', 'category'], inplace=True)
train.head()

model = Model()

# split train into train and test
from sklearn.model_selection import train_test_split
train, test, target, target_test = train_test_split(train, target, test_size=0.2, random_state=0)

# preprocess train and test
train, test = model.preprocess(train, test)
train, test = model.cluster_addLabels(train, test)
train, test = model.pca(train, test)
train, test = model.lda(train, test, target)
train, target = model.outLier(train, target)



# fit and predict
pred, prob = model.fit_predict(train, target, test)

# calculate the accuracy
from sklearn.metrics import accuracy_score
print(accuracy_score(target_test, pred))
# print(scores)

  rf.fit(train, target)
  y = column_or_1d(y, warn=True)


0.8073770491803278


In [46]:
prob

Unnamed: 0,0,1,2
0,3,0.529971,0.367522
1,14,0.47856,0.284735
2,15,0.269585,0.256209
3,32,0.485238,0.349929
4,34,0.223101,0.17988
5,37,0.199786,0.199421
6,54,0.379669,0.356991
7,55,0.236559,0.217959
8,70,0.386981,0.302257
9,72,0.523528,0.46748


In [47]:
train = pd.read_csv('train.csv')
target = train['category']
train.drop(columns=['ID', 'category'], inplace=True)

test = pd.read_csv('test.csv')
ID = test['ID']
test.drop(columns=['ID'], inplace=True)

model = Model()
train, test = model.preprocess(train, test)
train, test = model.cluster_addLabels(train, test)
train, test = model.pca(train, test)
train, test = model.lda(train, test, target)
train, target = model.outLier(train, target)


# y_pred = model.fit_predict(train, target, test

y_pred, prob = model.fit_predict(train, target, test)
y_pred

  rf.fit(train, target)
  y = column_or_1d(y, warn=True)


Unnamed: 0,0
0,Leeche_Raw
1,Orange_Raw
2,Pomengranate_Ripe
3,Strawberry_Raw
4,Orange_Ripe
...,...
410,Orange_Ripe
411,Mango_Ripe
412,Banana_Ripe
413,Orange_Raw


In [48]:
prob

Unnamed: 0,0,1,2
0,0,0.579331,0.407831
1,6,0.406042,0.25088
2,32,0.27014,0.215615
3,34,0.518189,0.376894
4,65,0.317616,0.275824
5,71,0.533069,0.459819
6,85,0.38874,0.218618
7,93,0.374609,0.33682
8,103,0.428443,0.3789
9,117,0.344794,0.236523


In [49]:
# write ID and y_pred to csv
# create two coloms ID, Category in a csv
submission = pd.DataFrame({'ID': ID, 'Category': y_pred[0]})
submission.to_csv('test4.csv', index=False)


In [50]:
y_pred.head()

Unnamed: 0,0
0,Leeche_Raw
1,Orange_Raw
2,Pomengranate_Ripe
3,Strawberry_Raw
4,Orange_Ripe


In [51]:
categories = y_pred[0]
categories

0             Leeche_Raw
1             Orange_Raw
2      Pomengranate_Ripe
3         Strawberry_Raw
4            Orange_Ripe
             ...        
410          Orange_Ripe
411           Mango_Ripe
412          Banana_Ripe
413           Orange_Raw
414       Strawberry_Raw
Name: 0, Length: 415, dtype: object