In [3]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from IPython.display import display
from sklearn import metrics 
import math
from pandas.api.types import is_string_dtype
from pandas.api.types import is_numeric_dtype

In [4]:
# Load the label table from Train.csv in the working directory.
# low_memory=False turns off pandas' chunked parsing of the file.
df=pd.read_csv("Train.csv",low_memory=False)
# Bare last expression so the notebook renders the frame.
df


Unnamed: 0,image_id,healthy,multiple_diseases,rust,scab
0,Train_0,0,0,0,1
1,Train_1,0,1,0,0
2,Train_2,1,0,0,0
3,Train_3,0,0,1,0
4,Train_4,1,0,0,0
...,...,...,...,...,...
1816,Train_1816,0,0,0,1
1817,Train_1817,1,0,0,0
1818,Train_1818,1,0,0,0
1819,Train_1819,0,0,1,0


#### İlk olarak veri setimizde NaN değerler var mı onu bulalım, daha sonra hangi değerler ile analiz yapacağımıza karar verelim

In [5]:
# Count the missing values in each column.
df.isnull().sum()

image_id             0
healthy              0
multiple_diseases    0
rust                 0
scab                 0
dtype: int64

### Hiç NaN değerimiz yok, o yüzden fix_missing kullanmamıza gerek yok
### Fakat string değerlerimiz var; bunları kategorik değişkene dönüştürüp sayısal (numeric) değerlere çevirmemiz gerekiyor

In [6]:
def train_cats(df):
    """Turn every string column of `df` into an ordered categorical, in place.

    Columns for which `is_string_dtype` is false are left untouched.
    Nothing is returned; the caller's frame is modified directly.
    """
    for col_name, series in df.items():
        if not is_string_dtype(series):
            continue
        as_category = series.astype("category")
        df[col_name] = as_category.cat.as_ordered()

In [7]:
def apply_cats(df,train):
    """Re-encode columns of `df` with the categories already present in `train`, in place.

    For every column of `df` that also exists in `train` as a categorical
    column, the values are converted to an ordered `pd.Categorical` that uses
    exactly the categories of the `train` column. Values that are not among
    those categories become missing (category code -1).

    Args:
        df: frame to convert; modified in place.
        train: reference frame whose categorical columns supply the categories.

    Returns:
        None.
    """
    for n,c in df.items():
        # Columns absent from `train` have no reference categories, so skip
        # them instead of raising KeyError.
        if n not in train.columns:
            continue
        # `.dtype.name` works for both NumPy and pandas extension dtypes.
        if train[n].dtype.name=="category":
            # Categories live on the `.cat` accessor of a Series.
            df[n]=pd.Categorical(c,categories=train[n].cat.categories,ordered=True)

In [8]:
# Convert the string columns of df to ordered categoricals (modifies df in place).
train_cats(df)

In [9]:
df["image_id"]

0          Train_0
1          Train_1
2          Train_2
3          Train_3
4          Train_4
           ...    
1816    Train_1816
1817    Train_1817
1818    Train_1818
1819    Train_1819
1820    Train_1820
Name: image_id, Length: 1821, dtype: category
Categories (1821, object): ['Train_0' < 'Train_1' < 'Train_10' < 'Train_100' ... 'Train_996' < 'Train_997' < 'Train_998' < 'Train_999']

In [10]:
df["image_id"].cat.codes

0          0
1          1
2        933
3       1044
4       1155
        ... 
1816     909
1817     910
1818     911
1819     912
1820     914
Length: 1821, dtype: int16

In [11]:
def numericalize(df,col,name):
    """Replace a non-numeric column of `df` with integer category codes, in place.

    Numeric columns are left untouched. For any other column the category
    codes are shifted by +1 before being stored, so the code pandas uses for
    a missing value (-1) becomes 0 and real categories start at 1.

    Args:
        df: frame that receives the encoded column; modified in place.
        col: the column's current values (a Series).
        name: column label under which the codes are written to `df`.

    Returns:
        None.
    """
    if is_numeric_dtype(col):
        return
    # The `.cat` accessor only exists on categorical data; convert anything
    # else first so the function does not raise AttributeError.
    if col.dtype.name!="category":
        col=col.astype("category")
    df[name]=col.cat.codes+1

In [12]:
# Overwrite the image_id column with its category code + 1 (modifies df in place).
numericalize(df,df["image_id"],"image_id")

In [13]:
df["image_id"]

0          1
1          2
2        934
3       1045
4       1156
        ... 
1816     910
1817     911
1818     912
1819     913
1820     915
Name: image_id, Length: 1821, dtype: int16

In [14]:
# Order the rows by the image_id column; the sorted frame replaces df,
# so every later cell sees this row order.
df=df.sort_values(by="image_id")

In [15]:
df

Unnamed: 0,image_id,healthy,multiple_diseases,rust,scab
0,1,0,0,0,1
1,2,0,1,0,0
10,3,0,0,1,0
100,4,1,0,0,0
1000,5,0,0,1,0
...,...,...,...,...,...
995,1817,0,0,1,0
996,1818,0,0,1,0
997,1819,0,0,0,1
998,1820,0,0,0,1


In [16]:
def fix_missing(df,col,name,nan_dict,is_train):
    """Fill missing values of a numeric column, in place.

    Non-numeric columns are ignored.

    Training mode (`is_train` true): if the column has missing values, a
    boolean indicator column `<name>_na` is added, the column median is stored
    in `nan_dict[name]`, and the gaps are filled with that median.

    Validation mode (`is_train` false): if `name` is in `nan_dict`, the
    indicator column is added and the gaps are filled with the stored value,
    so the validation frame gets the same columns and fill value as training.
    Otherwise any gaps are filled with the column's own median and `nan_dict`
    is left unchanged.

    Args:
        df: frame to modify in place.
        col: the column's current values (a Series).
        name: column label in `df`.
        nan_dict: mapping of column label to fill value; written in training
            mode, only read in validation mode.
        is_train: selects training or validation behaviour.

    Returns:
        None.
    """
    if not is_numeric_dtype(col):
        return

    missing=pd.isnull(col)
    if is_train:
        if missing.sum():
            df[name+"_na"]=missing
            nan_dict[name]=col.median()
            df[name]=col.fillna(nan_dict[name])
    elif name in nan_dict:
        # Reuse the training fill value; never overwrite it from validation data.
        df[name+"_na"]=missing
        df[name]=col.fillna(nan_dict[name])
    elif missing.any():
        # No fill value was recorded for this column, so fall back to its own median.
        df[name]=col.fillna(col.median())

In [17]:
# NOTE(review): `numericalize` is also defined in an earlier cell. This later
# definition is the one in effect from here on; keep the two in sync or drop one.
def numericalize(df,col,name):
    """Replace a non-numeric column of `df` with integer category codes, in place.

    Numeric columns are left untouched. For any other column the category
    codes are shifted by +1 before being stored, so the code pandas uses for
    a missing value (-1) becomes 0 and real categories start at 1.

    Args:
        df: frame that receives the encoded column; modified in place.
        col: the column's current values (a Series).
        name: column label under which the codes are written to `df`.

    Returns:
        None.
    """
    if is_numeric_dtype(col):
        return
    # The `.cat` accessor only exists on categorical data; convert anything
    # else first so the function does not raise AttributeError.
    if col.dtype.name!="category":
        col=col.astype("category")
    df[name]=col.cat.codes+1

In [18]:
def proc_df(df,y_fld,nan_dict=None,is_train=True):
    """Split a frame into features and target and preprocess the features.

    Works on a copy, so the caller's `df` is not modified. The `y_fld`
    column is pulled out as an array and dropped from the features; each
    remaining column is then passed through `fix_missing` and `numericalize`.

    Args:
        df: input frame.
        y_fld: label of the target column.
        nan_dict: fill values handed to `fix_missing`; a new dict is created
            when None.
        is_train: forwarded to `fix_missing`; also selects the return shape.

    Returns:
        `(features, y, nan_dict)` when `is_train` is true, else `(features, y)`.
    """
    features = df.copy()
    y = features[y_fld].values
    features.drop(y_fld, axis=1, inplace=True)

    fill_values = {} if nan_dict is None else nan_dict

    for col_name, column in features.items():
        fix_missing(features, column, col_name, fill_values, is_train)
        numericalize(features, column, col_name)

    if not is_train:
        return features, y
    return features, y, fill_values

In [19]:
def split_train_val(df,n):
    """Cut `df` at position `n` and return independent copies of both parts.

    Returns:
        A tuple `(df[:n] copy, df[n:] copy)`.
    """
    head = df[:n]
    tail = df[n:]
    return head.copy(), tail.copy()

In [20]:
# Hold out the last n_valid rows of df for validation; the rest is training data.
n_valid=100
n_train=len(df)-n_valid
# split_train_val slices by position, so the split follows df's current row order.
raw_train,raw_valid=split_train_val(df,n_train)

In [21]:
# 'healthy' is the target column: proc_df removes it from the features and returns it separately.
# The training call also returns the fill-value dict, which is handed to the validation call.
x_train,y_train,nas=proc_df(raw_train,'healthy')
x_valid,y_valid=proc_df(raw_valid,'healthy',nan_dict=nas,is_train=False)

In [22]:
# random_state fixes the forest's random sampling so a rerun reproduces the same model.
# NOTE(review): the features still include the other label columns
# (multiple_diseases, rust, scab), which look one-hot with `healthy` in the
# displayed rows -- a perfect score here is probably target leakage; confirm.
m=RandomForestRegressor(n_estimators=1000,n_jobs=-1,random_state=42)
m.fit(x_train,y_train)
# R^2 on the training data.
m.score(x_train,y_train)

1.0

In [23]:
def rmse(x,y):
    """Return the root-mean-square difference between `x` and `y`.

    Both arguments must support element-wise subtraction and the result must
    expose `.mean()` (e.g. NumPy arrays or pandas Series).
    """
    residuals = x - y
    mean_squared = (residuals ** 2).mean()
    return math.sqrt(mean_squared)

In [24]:
def print_score(m):
    """Print RMSE and R^2 of model `m` on the validation and training sets.

    Reads the notebook-level variables `x_valid`, `y_valid`, `x_train` and
    `y_train`; `m` must provide `predict` and `score`.
    """
    valid_rmse = rmse(m.predict(x_valid), y_valid)
    print(f"RMSE of valid set {valid_rmse}")
    valid_r2 = m.score(x_valid, y_valid)
    print(f"R^2 of valid set {valid_r2}")
    train_rmse = rmse(m.predict(x_train), y_train)
    print(f"RMSE of train set {train_rmse}")
    train_r2 = m.score(x_train, y_train)
    print(f"R^2 of train set {train_r2}")

In [25]:
print_score(m)

RMSE of valid set 0.0
R^2 of valid set 1.0
RMSE of train set 0.0
R^2 of train set 1.0


In [26]:
x_train

Unnamed: 0,image_id,multiple_diseases,rust,scab
0,1,0,0,1
1,2,1,0,0
10,3,0,1,0
100,4,0,0,0
1000,5,0,1,0
...,...,...,...,...
904,1717,0,0,1
905,1718,1,0,0
906,1719,0,0,1
907,1720,0,0,0


In [27]:
y_train

array([0, 0, 0, ..., 0, 1, 1], dtype=int64)

In [28]:
x_valid

Unnamed: 0,image_id,multiple_diseases,rust,scab
909,1722,0,0,0
91,1723,0,0,1
910,1724,0,1,0
911,1725,0,1,0
912,1726,0,1,0
...,...,...,...,...
995,1817,0,1,0
996,1818,0,1,0
997,1819,0,0,1
998,1820,0,0,1


In [29]:
y_valid

array([1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0,
       1, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 1, 1,
       0, 0, 0, 1, 1, 1, 0, 0, 1, 0, 1, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 1,
       1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0,
       1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], dtype=int64)

## R^2 olarak verimize yaklaştık, şimdi RMSE yaklaşımına bakalım

In [30]:
df

Unnamed: 0,image_id,healthy,multiple_diseases,rust,scab
0,1,0,0,0,1
1,2,0,1,0,0
10,3,0,0,1,0
100,4,1,0,0,0
1000,5,0,0,1,0
...,...,...,...,...,...
995,1817,0,0,1,0
996,1818,0,0,1,0
997,1819,0,0,0,1
998,1820,0,0,0,1


In [31]:
df["healthy"].mean()

0.2833607907742998

In [32]:
df["healthy"]-df["healthy"].mean()

0      -0.283361
1      -0.283361
10     -0.283361
100     0.716639
1000   -0.283361
          ...   
995    -0.283361
996    -0.283361
997    -0.283361
998    -0.283361
999    -0.283361
Name: healthy, Length: 1821, dtype: float64

In [33]:
# Root-mean-square deviation of `healthy` from its overall mean, i.e. the RMSE
# obtained when every row is predicted by the global mean (no split).
math.sqrt(((df.healthy-df.healthy.mean()).pow(2).sum())/len(df))

0.4506300622751031

In [34]:
# Partition the rows on `rust`: df2 holds the rows with rust > 0, df3 holds all the others.
filt=(df.rust>0)
df2=df[filt]
df3=df[~filt]

In [35]:
df2

Unnamed: 0,image_id,healthy,multiple_diseases,rust,scab
10,3,0,0,1,0
1000,5,0,0,1,0
1011,17,0,0,1,0
1019,25,0,0,1,0
102,26,0,0,1,0
...,...,...,...,...,...
985,1806,0,0,1,0
987,1808,0,0,1,0
995,1817,0,0,1,0
996,1818,0,0,1,0


In [36]:
# RMSE when each of the two groups (df2, df3) is predicted by its own mean of `healthy`:
# the squared deviations of both groups are pooled and divided by the total row count.
math.sqrt(((df2.healthy-df2.healthy.mean()).pow(2).sum()+(df3.healthy-df3.healthy.mean()).pow(2).sum())/len(df))

0.40176364748369797

In [37]:
# Split df3 (the rows where rust is not > 0) further on `scab`:
# df4 holds the rows with scab > 0, df5 holds the remainder.
filt2=(df3.scab>0)
df4=df3[filt2]
df5=df3[~filt2]

In [38]:
df4

Unnamed: 0,image_id,healthy,multiple_diseases,rust,scab
0,1,0,0,0,1
1003,8,0,0,0,1
1006,11,0,0,0,1
1008,13,0,0,0,1
1009,14,0,0,0,1
...,...,...,...,...,...
992,1814,0,0,0,1
993,1815,0,0,0,1
994,1816,0,0,0,1
997,1819,0,0,0,1


In [39]:
df5

Unnamed: 0,image_id,healthy,multiple_diseases,rust,scab
1,2,0,1,0,0
100,4,1,0,0,0
1001,6,1,0,0,0
1002,7,1,0,0,0
1004,9,1,0,0,0
...,...,...,...,...,...
980,1801,1,0,0,0
981,1802,1,0,0,0
983,1804,1,0,0,0
989,1810,1,0,0,0


In [41]:
# RMSE after both splits: the three groups (df2, df4, df5) are each predicted by
# their own mean of `healthy`; squared deviations are pooled over all rows of df.
math.sqrt(((df2.healthy-df2.healthy.mean()).pow(2).sum()+(df4.healthy-df4.healthy.mean()).pow(2).sum()+(df5.healthy-df5.healthy.mean()).pow(2).sum())/len(df))

0.20610865466930361