<a href="https://colab.research.google.com/github/mltrev23/tech-test/blob/main/1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Setup Environment

In [None]:
pip install scikit-learn xgboost pandas numpy matplotlib seaborn kaggle requests



Download csv from kaggle

In [None]:
import subprocess
import pandas as pd

# Define the Kaggle dataset URL
kaggle_url = "mssmartypants/rice-type-classification"

# Define the filename you want to download
file_name = "riceClassification.csv"

# Use Kaggle API to download the file
subprocess.run(['kaggle', 'datasets', 'download', '-d', kaggle_url, '-f', file_name])

CompletedProcess(args=['kaggle', 'datasets', 'download', '-d', 'mssmartypants/rice-type-classification', '-f', 'riceClassification.csv'], returncode=0)

Unzip riceClassification.zip

In [None]:
!unzip riceClassification.csv.zip

Archive:  riceClassification.csv.zip
  inflating: riceClassification.csv  


Data loading

In [9]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder

data = pd.read_csv('riceClassification.csv')

# drop id field
data = data.drop(columns = ['id'])

data.info()
data.describe()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 18185 entries, 0 to 18184
Data columns (total 11 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   Area             18185 non-null  int64  
 1   MajorAxisLength  18185 non-null  float64
 2   MinorAxisLength  18185 non-null  float64
 3   Eccentricity     18185 non-null  float64
 4   ConvexArea       18185 non-null  int64  
 5   EquivDiameter    18185 non-null  float64
 6   Extent           18185 non-null  float64
 7   Perimeter        18185 non-null  float64
 8   Roundness        18185 non-null  float64
 9   AspectRation     18185 non-null  float64
 10  Class            18185 non-null  int64  
dtypes: float64(8), int64(3)
memory usage: 1.5 MB


Unnamed: 0,Area,MajorAxisLength,MinorAxisLength,Eccentricity,ConvexArea,EquivDiameter,Extent,Perimeter,Roundness,AspectRation,Class
count,18185.0,18185.0,18185.0,18185.0,18185.0,18185.0,18185.0,18185.0,18185.0,18185.0,18185.0
mean,7036.492989,151.680754,59.807851,0.915406,7225.817872,94.132952,0.616653,351.606949,0.707998,2.599081,0.549079
std,1467.19715,12.376402,10.061653,0.030575,1502.006571,9.90625,0.104389,29.50062,0.06731,0.434836,0.497599
min,2522.0,74.133114,34.409894,0.676647,2579.0,56.666658,0.383239,197.015,0.17459,1.358128,0.0
25%,5962.0,145.67591,51.393151,0.891617,6125.0,87.126656,0.53853,333.99,0.650962,2.208527,0.0
50%,6660.0,153.88375,55.724288,0.923259,6843.0,92.085696,0.601194,353.088,0.701941,2.602966,1.0
75%,8423.0,160.056214,70.156593,0.941372,8645.0,103.559146,0.695664,373.003,0.76928,2.964101,1.0
max,10210.0,183.211434,82.550762,0.966774,11008.0,114.016559,0.886573,508.511,0.904748,3.911845,1.0


Data preprocessing

In [10]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder

# Example: Scaling features
scaler = StandardScaler()
data_scaled = pd.DataFrame(scaler.fit_transform(data.iloc[:, :-1]), columns=data.columns[:-1])


Split train data and test data

In [11]:
X = data_scaled
y = data['Class']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

Defining the model

In [12]:
from xgboost import XGBClassifier

model = XGBClassifier()

param_grid = {
    'n_estimators': [50, 100, 200],
    'learning_rate': [0.01, 0.1, 0.2],
    'max_depth': [3, 5, 7],
    'subsample': [0.6, 0.8, 1.0],
    'colsample_bytree': [0.6, 0.8, 1.0]
}

Hyperparameter Tuning

In [13]:
from sklearn.model_selection import RandomizedSearchCV

random_search = RandomizedSearchCV(estimator=model, param_distributions=param_grid, n_iter=20, cv=5, scoring='accuracy', n_jobs=-1, verbose=2, random_state=42)
random_search.fit(X_train, y_train)

print(f'Best parameters found: {random_search.best_params_}')

Fitting 5 folds for each of 20 candidates, totalling 100 fits
Best parameters found: {'subsample': 1.0, 'n_estimators': 100, 'max_depth': 3, 'learning_rate': 0.1, 'colsample_bytree': 0.8}


Model training

In [14]:
best_model = random_search.best_estimator_  # or random_search.best_estimator_

best_model.fit(X_train, y_train)

Model evaluation

In [15]:
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

y_pred = best_model.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy}')

print("Classification Report:")
print(classification_report(y_test, y_pred))

print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))


Accuracy: 0.9898267803134452
Classification Report:
              precision    recall  f1-score   support

           0       0.99      0.99      0.99      1632
           1       0.99      0.99      0.99      2005

    accuracy                           0.99      3637
   macro avg       0.99      0.99      0.99      3637
weighted avg       0.99      0.99      0.99      3637

Confusion Matrix:
[[1611   21]
 [  16 1989]]


Save the model

In [16]:
import joblib

# Save the model
joblib.dump(best_model, 'xgboost_rice_model.pkl')

['xgboost_rice_model.pkl']

# Deploy model

In [51]:
pip install fastapi uvicorn pyngrok nest_asyncio

Collecting pyngrok
  Downloading pyngrok-7.2.0-py3-none-any.whl.metadata (7.4 kB)
Downloading pyngrok-7.2.0-py3-none-any.whl (22 kB)
Installing collected packages: pyngrok
Successfully installed pyngrok-7.2.0


create fastapi app

In [24]:
from fastapi import FastAPI
import joblib
import pandas as pd
from pydantic import BaseModel

# Initialize FastAPI app
app = FastAPI()

# Load the trained model
model = joblib.load('xgboost_rice_model.pkl')

# Define the input data model
class RiceData(BaseModel):
    Area: float
    Perimeter: float
    MajorAxisLength: float
    MinorAxisLength: float
    AspectRation: float
    Eccentricity: float
    ConvexArea: float
    EquivDiameter: float
    Extent: float
    Solidity: float
    Roundness: float
    Compactness: float
    ShapeFactor1: float
    ShapeFactor2: float
    ShapeFactor3: float
    ShapeFactor4: float

# Define the prediction route
@app.post('/predict')
def predict_rice_type(data: RiceData):
    # Convert input data to DataFrame
    input_data = pd.DataFrame([data.dict().values()], columns=data.dict().keys())

    # Reorder columns to match the model's training order
    training_features = ['Area', 'MajorAxisLength', 'MinorAxisLength', 'Eccentricity',
                         'ConvexArea', 'EquivDiameter', 'Extent', 'Perimeter',
                         'Roundness', 'AspectRation']

    input_data = input_data[training_features]

    # Make prediction
    prediction = model.predict(input_data)

    return {'prediction': prediction[0]}

In [58]:
!ngrok authtoken 2laQP6bVYRgAXRWonIEL3VdYIfQ_29SVCfHbGCRJxAieHco41

Authtoken saved to configuration file: /root/.config/ngrok/ngrok.yml


Run app

In [25]:
import uvicorn
import nest_asyncio
from pyngrok import ngrok

# Apply the nest_asyncio patch
nest_asyncio.apply()

public_url = ngrok.connect(9001, "http")
print('Public URL:', public_url)

uvicorn.run(app, host='0.0.0.0', port=9001)


Public URL: NgrokTunnel: "https://0f63-35-247-40-7.ngrok-free.app" -> "http://localhost:9001"


INFO:     Started server process [19502]
INFO:     Waiting for application startup.
INFO:     Application startup complete.
INFO:     Uvicorn running on http://0.0.0.0:9001 (Press CTRL+C to quit)
INFO:     Shutting down
INFO:     Waiting for application shutdown.
INFO:     Application shutdown complete.
INFO:     Finished server process [19502]
