In [10]:
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [11]:
import nltk
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [12]:
from nltk.corpus import stopwords
stop_words = stopwords.words('english')

In [13]:
df_video=pd.read_csv("train.csv")
df_video.head()

Unnamed: 0,video_id,category_id,description
0,9WFf9XY5rME,24.0,CLICK TO SUBSCRIBE TO THE YOUTUBERS IN THIS EP...
1,REbjfHF0N0s,10.0,Jia & Jackson in the #MOOD\n\nProduced by The ...
2,hDEc4ImIVHk,25.0,Speaking at the March for Our Lives event in W...
3,JyUKqUTp9rc,26.0,HEY EVERYONE! Today I'm testing out the brand ...
4,leu-cTvMWTA,10.0,EXO's Winter Special Album Universe has been r...


In [14]:
df_video.shape

(5205, 3)

In [15]:
df_video.dtypes

video_id        object
category_id    float64
description     object
dtype: object

In [16]:
df_video.isnull().sum()

video_id       0
category_id    0
description    0
dtype: int64

In [17]:
df_video.category_id.value_counts()

24.0    1272
10.0     781
26.0     521
23.0     438
25.0     375
22.0     373
17.0     305
28.0     299
1.0      269
27.0     229
15.0     132
2.0       73
20.0      73
19.0      57
29.0       5
43.0       3
Name: category_id, dtype: int64

In [18]:
#preprocessing of train data

In [19]:
import re
df_video['description'] = df_video['description'].apply(lambda x: re.sub('[^a-zA-Z\s]','',x))

In [20]:
df_video['description'] = df_video['description'].apply(lambda x: x.lower())

In [21]:
t = re.compile(".*'t")
print(list(filter(t.match, stop_words)))

["don't", "aren't", "couldn't", "didn't", "doesn't", "hadn't", "hasn't", "haven't", "isn't", "mightn't", "mustn't", "needn't", "shan't", "shouldn't", "wasn't", "weren't", "won't", "wouldn't"]


In [22]:
update_words = list(filter(t.match, stop_words))

In [23]:
stopwrds = set([i for i in stop_words  if i not in update_words])

In [24]:
from nltk.stem.wordnet import WordNetLemmatizer
lemma = WordNetLemmatizer()
corpus = []

for i in df_video['description']:
    tokens = i.split()
    clean_token = [lemma.lemmatize(i) for i in tokens if i not in stopwrds]
    corpus.append(' '.join(clean_token))
    
df_video['preprocess_description'] = corpus

In [25]:
df_video.head()

Unnamed: 0,video_id,category_id,description,preprocess_description
0,9WFf9XY5rME,24.0,click to subscribe to the youtubers in this ep...,click subscribe youtubers episode httpsgooglez...
1,REbjfHF0N0s,10.0,jia jackson in the moodnnproduced by the part...,jia jackson moodnnproduced partysquad alvaro r...
2,hDEc4ImIVHk,25.0,speaking at the march for our lives event in w...,speaking march life event washington marjory s...
3,JyUKqUTp9rc,26.0,hey everyone today im testing out the brand ne...,hey everyone today im testing brand new collec...
4,leu-cTvMWTA,10.0,exos winter special album universe has been re...,exos winter special album universe releasednli...


In [26]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

vocabsize = 19000 
sentence_length = 475

tokenizer = Tokenizer(num_words=vocabsize, split=' ')
tokenizer.fit_on_texts(df_video['preprocess_description'].values)
    
X = tokenizer.texts_to_sequences(df_video['preprocess_description'].values)
X = pad_sequences(X,maxlen=sentence_length)

In [27]:
X[:5]

array([[    0,     0,     0, ...,   298,  5832,  9528],
       [    0,     0,     0, ...,  1258,     1,   407],
       [    0,     0,     0, ...,   888,   574,   508],
       [    0,     0,     0, ...,  5356, 14504,  5834],
       [    0,     0,     0, ...,   712,  2073,  3550]], dtype=int32)

In [28]:
y=df_video['category_id']

In [29]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=10)

X_train.shape, X_test.shape, y_train.shape, y_test.shape

((4164, 475), (1041, 475), (4164,), (1041,))

In [30]:
#Using K Nearest Neighbor

In [31]:
from sklearn.neighbors import KNeighborsClassifier
knn = KNeighborsClassifier()

In [32]:
knn.fit(X, y)

KNeighborsClassifier()

In [33]:
pred1=knn.predict(X_test)
pred1

array([25., 24., 22., ...,  2., 28., 10.])

In [34]:
#F1 score
from sklearn import metrics

100*metrics.f1_score(y_test, pred1,average='weighted')

68.90631212124296

In [35]:
# Using XGBoost

In [36]:
from xgboost import XGBClassifier
xg=XGBClassifier()

In [37]:
xg.fit(X, y)

XGBClassifier(objective='multi:softprob')

In [38]:
pred2=xg.predict(X_test)
pred2

array([25., 24., 26., ...,  2., 28., 24.])

In [39]:
#F1 score

100*metrics.f1_score(y_test, pred2,average='weighted')

76.13305461815291

In [None]:
#Test data

In [41]:
df_tst=pd.read_csv('test.csv')
df_tst.head()

Unnamed: 0,video_id,description
0,tfNzMKvoVOU,States are taking a multi-million dollar gambl...
1,PjqKPHZJgF0,"Family Feud by Lil Wayne feat. Drake, off the ..."
2,KobCmaF10vQ,Black Nerd Review of Black Lightning Episode 1...
3,3D_ZODCSKXo,On this episode of Collider Movie Talk (Wednes...
4,6kyXZGyso8M,"Alabama head coach Nick Saban, freshman quarte..."


In [None]:
# preprocessing of test data

In [42]:
df_tst['description'] = df_tst['description'].apply(lambda x: re.sub('[^a-zA-Z\s]','',x))
df_tst['description'] = df_tst['description'].apply(lambda x: x.lower())
ts = re.compile(".*'t")
print(list(filter(ts.match, stop_words)))

["don't", "aren't", "couldn't", "didn't", "doesn't", "hadn't", "hasn't", "haven't", "isn't", "mightn't", "mustn't", "needn't", "shan't", "shouldn't", "wasn't", "weren't", "won't", "wouldn't"]


In [43]:
update_words = list(filter(ts.match, stop_words))

In [44]:
stopwrds = set([i for i in stop_words  if i not in update_words])

In [45]:
lemma = WordNetLemmatizer()
corpus = []

for i in df_tst['description']:
    tokens = i.split()
    clean_token = [lemma.lemmatize(i) for i in tokens if i not in stopwrds]
    corpus.append(' '.join(clean_token))
    
df_tst['preprocess_tst_description'] = corpus

In [46]:
df_tst.head()

Unnamed: 0,video_id,description,preprocess_tst_description
0,tfNzMKvoVOU,states are taking a multimillion dollar gamble...,state taking multimillion dollar gamble techno...
1,PjqKPHZJgF0,family feud by lil wayne feat drake off the de...,family feud lil wayne feat drake dedication mi...
2,KobCmaF10vQ,black nerd review of black lightning episode ...,black nerd review black lightning episode seri...
3,3D_ZODCSKXo,on this episode of collider movie talk wednesd...,episode collider movie talk wednesday january ...
4,6kyXZGyso8M,alabama head coach nick saban freshman quarter...,alabama head coach nick saban freshman quarter...


In [47]:
Xt = tokenizer.texts_to_sequences(df_tst['preprocess_tst_description'].values)
Xt = pad_sequences(Xt, maxlen=sentence_length)

Xt.shape

(2232, 475)

In [48]:
Xt[:5]

array([[    0,     0,     0, ...,  7550,     8,  7551],
       [    0,     0,     0, ...,   447,    11, 15964],
       [    0,     0,     0, ...,   242,  1596,  8144],
       [    0,     0,     0, ...,    70,    29,  9882],
       [    0,     0,     0, ...,  9152,   583,  4982]], dtype=int32)

In [49]:
pred_final=xg.predict(Xt)
pred_final

array([24., 10., 24., ..., 26., 24., 24.])

In [50]:
df_sample=pd.DataFrame()
df_sample['video_id']=df_tst['video_id']
df_sample['category_id']=pred_final

In [51]:
df_sample.head()

Unnamed: 0,video_id,category_id
0,tfNzMKvoVOU,24.0
1,PjqKPHZJgF0,10.0
2,KobCmaF10vQ,24.0
3,3D_ZODCSKXo,24.0
4,6kyXZGyso8M,24.0
