In [None]:
!pip install pycaret

from pycaret.classification import *

import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
#sns.set_theme(style="darkgrid")
import pandas as pd

from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import SMOTE
from sklearn.preprocessing import LabelEncoder


# Load the dataset into `df`; every later step in this notebook works on this frame.
# To analyse the alternative CSV instead, swap which of the two lines is commented out.
df = pd.read_csv('seeds.csv')
#df = pd.read_csv('HepatitisCdata.csv')

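################ Clean your data
# NOTE: the cleaning template below is disabled (it is wrapped in a triple-quoted
# string, so none of it runs). Copy out the steps you need and adapt the column
# names to the dataset loaded above.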


'''
#browse part of your data
print(df.head())

#Display data columns and types
df.columns
df.dtypes

# drop some columns 

df.drop(['Unnamed: 0'],axis=1,inplace=True)
df.dtypes

# Check unique values of a certain column
df['Category'].unique()

# Manage missing values
for col in df.columns:
    pct_missing = np.mean(df[col].isnull())
    print('{} - {}%'.format(col, round(pct_missing*100,3)))

#delete the rows which are completely null 
df.dropna(how='all',axis=0,inplace=True) 

#fill the missing values using the mean of the column 
pd.isnull(df).sum()
df.fillna(df.mean(), inplace=True)


#count the unique values

df['Category'][df['Category']=='0=Blood Donor'].value_counts()
df['Category'][df['Category']=='3=Cirrhosis'].value_counts()


## Display duplicte rows
print("Number of duplicated rows is: ", df.duplicated().sum())

### delete duplicate rows
df.drop_duplicates(inplace=True)

# Some basic information about each column in the DataFrame 
df.info()
df.describe()



#Convert nominal data to numeric data
df['Category'].unique()
labels, levels = pd.factorize(df['Category'])
df['Category']=labels

labels, levels = pd.factorize(df['Sex'])
df['Sex']=labels

for i in range(0, len(df.columns)):
    df.iloc[:,i] = pd.to_numeric(df.iloc[:,i], errors='ignore')


'''
#################################################################


from scipy import stats

def drop_numerical_outliers(df, z_thresh=3):
    """Drop, in place, every row that holds an outlier in any numeric column.

    A value counts as an outlier when its absolute z-score is at least
    ``z_thresh``. Z-scores use the population standard deviation
    (``ddof=0``), the same definition ``scipy.stats.zscore`` uses by default.

    Missing values and zero-variance columns yield NaN z-scores. NaN is
    treated as "not an outlier", so a single missing value or a constant
    column does not cause every row to be rejected.

    Args:
        df: DataFrame to filter. It is modified in place.
        z_thresh: Absolute z-score at or above which a value is rejected.

    Returns:
        None. The rows are removed from ``df`` itself.
    """
    numeric = df.select_dtypes(include=[np.number])
    if numeric.shape[1] == 0 or len(numeric) == 0:
        # Nothing to score: no numeric columns, or no rows at all.
        return

    # mean()/std() skip NaN, so one missing value does not poison a column.
    z_scores = (numeric - numeric.mean()) / numeric.std(ddof=0)

    # Comparisons against NaN are False, so NaN z-scores (missing values,
    # constant columns) never mark a row as an outlier.
    is_outlier = (z_scores.abs() >= z_thresh).any(axis=1)

    # Drop (inplace) the rows set to be rejected.
    df.drop(df.index[is_outlier], inplace=True)
    
# Remove rows containing numeric outliers (modifies df in place).
drop_numerical_outliers(df)

############### Analyze your data



############# Apply machine learning classifiers

# training and validation data
data = df.sample(frac=0.8, random_state=786)

# test data: every row that was not sampled above
data_unseen = df.drop(data.index)

data.reset_index(drop=True, inplace=True)
data_unseen.reset_index(drop=True, inplace=True)

# Shift the class labels 1, 2, 3 down to 0, 1, 2; any other value is left as is.
# One replace() assigned back to the column avoids chained assignment
# (data['class'][mask] = v), which triggers SettingWithCopyWarning and does not
# write through when pandas Copy-on-Write is active.
class_remap = {1: 0, 2: 1, 3: 2}
data['class'] = data['class'].replace(class_remap)
data_unseen['class'] = data_unseen['class'].replace(class_remap)


print('Data for Modeling: ' + str(data.shape))
print('Unseen Data For Predictions: ' + str(data_unseen.shape))

## Display duplicate rows
print("Number of duplicated rows is: ", data.duplicated().sum())

### delete duplicate rows (only from the modeling data; data_unseen is untouched)
data.drop_duplicates(inplace=True)

'''
# Managing the imbalanced data
X, y = data.loc[:, 'AR':'LG'], data.loc[:, 'class']
# label encode the target variable
y = LabelEncoder().fit_transform(y)
# transform the dataset
oversample = SMOTE()
X, y = oversample.fit_resample(X, y)
data= X
data['class'] = y
'''


# Initialise the PyCaret experiment on the modeling data; 'class' is the target.
clf = setup(data=data, target='class', train_size=0.7, session_id=123)
############ training step
best_model = compare_models()

## use a certain model
rf = create_model('rf')

# The model prediction
#pp  = predict_model(rf)

#check_metric(pp['class'], pp['Label'], metric = 'Accuracy')


# predict labels for unseen data


unseen_predictions = predict_model(rf, data=data_unseen)
unseen_predictions.head()

#Apply the performance metrics on test (unseen) data

# `pip install pycaret` above is unpinned, so support both major versions:
# PyCaret 2.x exposes check_metric in pycaret.utils, 3.x in pycaret.utils.generic.
try:
    from pycaret.utils import check_metric
except ImportError:
    from pycaret.utils.generic import check_metric

# The predicted-label column is 'Label' in PyCaret 2.x and 'prediction_label'
# in 3.x; use whichever one predict_model() actually produced.
label_col = 'prediction_label' if 'prediction_label' in unseen_predictions.columns else 'Label'

check_metric(unseen_predictions['class'], unseen_predictions[label_col], metric = 'F1')


# Tuning the model

tuned_rf = tune_model(rf)

# Print both estimators so their hyperparameters can be compared.
print(rf)
print(tuned_rf)
# Tuned model prediction

unseen_predictions1 = predict_model(tuned_rf, data=data_unseen)
unseen_predictions1.head()

check_metric(unseen_predictions1['class'], unseen_predictions1[label_col], metric = 'Accuracy')




0.881

In [None]:
# Train a ridge classifier through PyCaret's create_model and keep it in `r`.
# The recorded output below lists Accuracy, AUC, Recall, Prec., F1, Kappa and MCC
# for folds 0-9. NOTE(review): presumably relies on the experiment created by
# setup() in the previous cell - confirm that cell was run first.
r = create_model('ridge')

Unnamed: 0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,1.0,0.0,1.0,1.0,1.0,1.0,1.0
1,1.0,0.0,1.0,1.0,1.0,1.0,1.0
2,0.9231,0.0,0.9167,0.9385,0.9219,0.8839,0.8919
3,1.0,0.0,1.0,1.0,1.0,1.0,1.0
4,1.0,0.0,1.0,1.0,1.0,1.0,1.0
5,0.9231,0.0,0.9333,0.9385,0.9231,0.885,0.8929
6,1.0,0.0,1.0,1.0,1.0,1.0,1.0
7,0.8462,0.0,0.85,0.8615,0.8462,0.7699,0.7768
8,0.9231,0.0,0.9167,0.9385,0.9219,0.8839,0.8919
9,1.0,0.0,1.0,1.0,1.0,1.0,1.0
