In [5]:
# Importing GPU libraries
import cudf as pd
import cupy as np
import cuml

In [6]:
from cuml.naive_bayes import GaussianNB
from cuml.svm import SVC
from cuml.linear_model import LogisticRegression

In [7]:
!ls

# Importing the required libraries

In [8]:

from PIL import Image
#import torch
import matplotlib.pyplot as plt
import csv
#import torch.nn.functional as F
#from torch import nn
import os
import random
import time
%matplotlib inline
from datetime import datetime
from torchvision import datasets, transforms, models

In [9]:
from cuml.model_selection import train_test_split
from tqdm import tqdm

In [10]:
from sklearn.metrics import accuracy_score,precision_score,recall_score,f1_score
from sklearn.datasets import load_files
from keras.utils import np_utils
from keras.utils.vis_utils import plot_model
from keras.layers import Conv2D, MaxPooling2D
from keras.layers import Dropout, Flatten, Dense
from keras.models import Sequential
from sklearn.metrics import confusion_matrix
from keras.preprocessing import image                  
import pickle
import seaborn as sns
import tensorflow as tf
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from keras import Model
from keras.layers import Input, GlobalAveragePooling2D, BatchNormalization, Dropout, Dense
from tensorflow.keras.applications import EfficientNetB3
from keras.callbacks import ModelCheckpoint, EarlyStopping

In [11]:
sample = pd.read_csv('../input/state-farm-distracted-driver-detection/sample_submission.csv') 

In [12]:
sample.head()

In [13]:
train = pd.read_csv('../input/state-farm-distracted-driver-detection/driver_imgs_list.csv')   
train.head(10) 

In [14]:
train.info()

In [15]:
train['img'].unique()
#train['classname'].isnull().sum()

In [16]:
da =train['classname'].unique()
da

In [17]:
train['classname'].value_counts()

# Checking whether dataset is balanced or not

In [18]:
# Count the images per class once, on the host, and reuse the result for both
# the printed table and the bar chart (the original recomputed it twice).
class_counts = train['classname'].value_counts().to_pandas()
print(class_counts)

# Label the chart so it can be read on its own when the notebook is skimmed.
ax = class_counts.plot(kind='bar')
ax.set(
    title='Training images per class',
    xlabel='classname',
    ylabel='Number of images',
);

In [19]:
# Summary statistics for the training image list.
train.describe()

In [20]:
# (rows, columns) of the training image list.
train.shape


# Observation:

 There are a total of 22,424 training samples.

 The training dataset is roughly balanced across the classes, so no downsampling of the data is required.

In [21]:
# Goal: predict the likelihood of what the driver is doing in each picture.

# The 10 classes to predict are:

#  c0: safe driving
#  c1: texting - right
#  c2: talking on the phone - right
#  c3: texting - left
#  c4: talking on the phone - left
#  c5: operating the radio
#  c6: drinking
#  c7: reaching behind
#  c8: hair and makeup
#  c9: talking to passenger

# Converting into 64*64 images

The 64×64 target size can be replaced with 224×224 for better results, but only if more than 32 GB of RAM is available.

In [22]:
# Load every training image, resize it, flatten it to one feature vector and
# scale the pixel values to [0, 1]. The result `X` has one row per image.
IMG_SIZE = (64, 64)  # (height, width) as expected by Keras `load_img`
TRAIN_DIR = '../input/state-farm-distracted-driver-detection/imgs/train/'

# `train` is a cuDF frame: copy the two text columns to the host once instead
# of indexing the GPU-backed Series element by element inside the loop.
labels_host = train["classname"].to_pandas()
files_host = train["img"].to_pandas()

train_img = []
for label, fname in tqdm(zip(labels_host, files_host), total=len(labels_host)):
    # Images are stored as <TRAIN_DIR>/<classname>/<img>.
    img = image.load_img(os.path.join(TRAIN_DIR, label, fname), target_size=IMG_SIZE)
    img = image.img_to_array(img).flatten()  # flatten H x W x C to one dimension
    img = img / 255  # dividing by 255 converts the values to the range 0 to 1
    train_img.append(img)

# `np` is CuPy in this notebook, so `X` is created as a device array.
X = np.array(train_img)

# Encodes the object as an enumerated type or categorical variable.

In [23]:
fact = pd.factorize(train['classname'])
# This method is useful for obtaining a numeric representation of an array
# y = fact[0]
# print(y)
rep = fact[1]
y = fact[0]
print(y)
print(rep)

In [24]:
X.shape

In [25]:
y.shape

# Splitting into Train and Test sets

In [64]:
# Hold out 10% of the samples for evaluation; the seed is fixed so the split
# is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.1,
    random_state=42,
)

# Support Vector Classifier

This takes a lot of memory; the accuracy is 0.9924.

In [27]:
# clf_svc = SVC(probability=True)
# clf_svc.fit(X_train, y_train)

In [28]:
# preds_prob= clf_svc.predict_proba(X_test) # predicting probability
# preds_prob[0]

# preds_prob[1]

# preds= clf_svc.predict(X_test)
# cu_score = cuml.metrics.accuracy_score( y_test, preds )
# print(cu_score)

# Logistic Regression

In [29]:
# Fit a logistic-regression classifier on the training split and report its
# accuracy on the held-out split.
clf_lr = LogisticRegression().fit(X_train, y_train)

pred = clf_lr.predict(X_test)
cu_score = cuml.metrics.accuracy_score(y_test, pred)
print(cu_score)

# Calculating Confusion Matrix

In [65]:
# Confusion matrix of the logistic-regression predictions on the held-out split.
# Called through `cuml.metrics` (as the accuracy score is) rather than via a
# mid-notebook `from cuml.metrics import confusion_matrix`, which silently
# shadowed the sklearn `confusion_matrix` imported in the imports cell.
# Labels are cast to int32 before being passed in, as in the original call.
cm = cuml.metrics.confusion_matrix(y_test.astype("int32"), pred.astype("int32"))

In [66]:
# Plot the confusion matrix as an annotated heatmap. `cm` is copied to the
# host first (`np` is CuPy here) because seaborn needs a host array.
fig, ax = plt.subplots()
sns.heatmap(np.asnumpy(cm), annot=True, cmap='CMRmap', fmt='g', ax=ax)
# True labels were passed first when building `cm`, so rows are true classes.
ax.set(
    title='Logistic regression: confusion matrix',
    xlabel='Predicted class index',
    ylabel='True class index',
);

In [67]:
# Class probabilities from the fitted logistic-regression model for the
# held-out samples.
pred_prob= clf_lr.predict_proba(X_test)

In [68]:
# Probabilities for the first held-out sample.
pred_prob[0]

# XGBoost 
This did not work because more memory was requested than was free:

Free memory: 1260126208

Requested memory: 1983971328


In [34]:
# import xgboost as xgb
# xgb_clf = xgb.XGBClassifier(use_label_encoder=False,tree_method='gpu_hist')
# xgb_clf.fit(X_train, y_train)
# pred_prob_xgb=xgb_clf.predict_proba(X_test)
# pred= xgb_clf.predict(X_test)
# cu_score = cuml.metrics.accuracy_score( y_test, pred)
# print(cu_score)


# Lightgbm

The run stopped with a message saying the notebook tried to use more memory than was allocated.

In [35]:
# import lightgbm as lgb
# lgb_clf = lgb.LGBMClassifier(boosting_type='dart',learning_rate=0.18, max_depth=7,
#                n_estimators=450,objective='binary',device='gpu',
#                random_state=42)
# lgb_clf.fit(np.asnumpy(X_train),np.asnumpy(y_train))
# pred_prob_lgb=lgb_clf.predict_proba(np.asnumpy(X_test))
# pred= lgb_clf.predict(np.asnumpy(X_test))
# cu_score = cuml.metrics.accuracy_score( y_test, pred )
# print(cu_score)
# accuracy is 0.9977699

# Random Forest Classifier

In [36]:
# from cuml.ensemble import RandomForestClassifier
# rdf_clf=RandomForestClassifier(n_estimators=600,random_state=42, verbose=0,warm_start=False)
# rdf_clf.fit(X_train, y_train)
# preds_prob_rdf=rdf_clf.predict_proba(X_test)
# preds= rdf_clf.predict(X_test)
# cu_score = cuml.metrics.accuracy_score( y_test, preds )
# print(cu_score) accuracy is 0.97993

In [69]:
test = pd.read_csv('../input/state-farm-distracted-driver-detection/sample_submission.csv')   
test.head() 

In [72]:
y

In [92]:
train.head()

In [94]:
# Build the submission from the logistic-regression class probabilities.
# The original cell wrote `train` (the training image list) to submission.csv,
# which contains no predictions at all.
# NOTE(review): assumes the test images live in imgs/test/ next to imgs/train/
# and that the template has one column per class name — confirm against the data.
TEST_DIR = '../input/state-farm-distracted-driver-detection/imgs/test/'
PRED_BATCH = 1024  # images per prediction batch, to keep memory use bounded

submission = test.to_pandas()
test_files = submission['img'].tolist()

# Column j of `predict_proba` corresponds to integer label j, and `rep[j]` is
# the class name that `factorize` mapped to j.
class_cols = [str(name) for name in rep.to_pandas()]
assert set(class_cols) <= set(submission.columns), "template is missing class columns"

batch_probs = []
for start in tqdm(range(0, len(test_files), PRED_BATCH)):
    # Same preprocessing as the training images: 64x64, flattened, scaled to [0, 1].
    batch = np.array([
        image.img_to_array(
            image.load_img(os.path.join(TEST_DIR, fname), target_size=(64, 64))
        ).flatten() / 255
        for fname in test_files[start:start + PRED_BATCH]
    ])
    batch_probs.append(np.asarray(clf_lr.predict_proba(batch)))

# `np` is CuPy: concatenate on the device, then copy to the host for pandas.
submission[class_cols] = np.asnumpy(np.concatenate(batch_probs))
submission.to_csv("submission.csv", index=False)