In this notebook we classify Instagram captions into topic categories, using TF-IDF features and a Support Vector Machine (SVM) classifier.

In [1]:
import pickle
import re

import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC

# `sklearn.externals.joblib` is gone from newer scikit-learn releases; prefer
# the standalone package and keep the old import path as a fallback.
try:
    import joblib
except ImportError:
    from sklearn.externals import joblib



In [2]:
# Load the labelled caption data set from a CSV file (path is relative to the
# notebook's working directory).
# NOTE(review): the displayed frame has an "Unnamed: 0" column, which suggests
# the CSV's first column is a saved row index -- confirm, then consider
# reading it with index_col=0.
train=pd.read_csv('woytrain.csv')
# Bare expression: renders the frame as the cell output.
train

Unnamed: 0,username,caption,labelling
0,rajvegad055,viral top instatop public photography ed...,1
1,flexkulture,fashion fashionista streetstyle streetwear...,1
2,ishovonn,insta instagram travelgram travel instahu...,1
3,thesundayclub_,tas taswanita tascewek tasimport tasmurah ...,1
4,ay_mumtaz,bajumuslim style fashion atasan bajuatasa...,1
...,...,...,...
66166,macon_travel_easy,macay luxuryvacationrentals workwiththebest...,5
66167,yogabudiman94,holiday pulaupramuka pramukaisland,5
66168,plasatravel.id,tiketmurah travel plasatravel paketliburan,5
66169,frise_invaligia,ulivet pugli carovign ital iamt travelre...,5


In [3]:
# Swap the numeric category codes (1..5) for their display names.
# Series.map yields NaN for any value that is not a key of the mapping, so
# this cell is meant to run once on the freshly loaded frame.
train['labelling'] = train['labelling'].map(
    dict(
        enumerate(
            [
                'Fashion',
                'Food & Beverage',
                'Technology',
                'Health & Beauty',
                'Lifestyle & Travel',
            ],
            start=1,
        )
    )
)

In [4]:
# Normalise the caption text in a single vectorised pass:
#   1. missing values become '' and every entry is coerced to str, so absent
#      captions stay empty instead of turning into the literal word "nan";
#   2. every character other than an ASCII letter or '#' is replaced by a
#      space -- regex=True is spelled out because newer pandas releases treat
#      the pattern as a literal string by default, which would silently skip
#      this step;
#   3. runs of whitespace collapse to one space and the ends are trimmed;
#   4. everything is lower-cased.
train['caption'] = (
    train['caption']
    .fillna('')
    .astype(str)
    .str.replace(r"[^a-zA-Z#]", " ", regex=True)
    .str.split()
    .str.join(' ')
    .str.lower()
)

#remove emoticon
def Emoticon_clean(text):
    """Return ``text`` with every run of emoji characters removed.

    The character class below covers the emoticon, symbol-and-pictograph,
    transport-and-map and flag (regional indicator) code-point ranges.
    Characters outside those ranges are left untouched.
    """
    emoji_class = (
        "["
        u"\U0001F600-\U0001F64F"  # emoticons
        u"\U0001F300-\U0001F5FF"  # symbols & pictographs
        u"\U0001F680-\U0001F6FF"  # transport & map symbols
        u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
        u"\U0001F1F2-\U0001F1F4"  # Macau flag
        u"\U0001F1E6-\U0001F1FF"  # flags
        "]+"
    )
    return re.sub(emoji_class, r'', text, flags=re.UNICODE)

# Strip emoji from every caption; the function is passed directly, no wrapper needed.
train['caption'] = train['caption'].apply(Emoticon_clean)

In [5]:
# Inspect the first 50 rows after the label mapping and caption cleaning.
train.head(50)

Unnamed: 0,username,caption,labelling
0,rajvegad055,viral top instatop public photography editz po...,Fashion
1,flexkulture,fashion fashionista streetstyle streetwearfash...,Fashion
2,ishovonn,insta instagram travelgram travel instahub ins...,Fashion
3,thesundayclub_,tas taswanita tascewek tasimport tasmurah tasm...,Fashion
4,ay_mumtaz,bajumuslim style fashion atasan bajuatasan rok...,Fashion
5,hausofkochi,fashion dubaifashion dolceandgabbana fashionav...,Fashion
6,p.carmila_putrinovianti99,olahraga style selebgram folow folow hunting p...,Fashion
7,mfadli0,like like followersindonesia visitsumut medan ...,Fashion
8,mithra.us,mithra mensfashion fashion style menstyle ootd...,Fashion
9,beautytips.blog,dewiroesdji dewiqu fotos fashionblog fashionde...,Fashion


In [6]:
# Caption column of the training frame; these are the documents fed to the vectorizer.
caption = train['caption']

### Compute the TF-IDF values

In [7]:
# Two-stage featurisation: raw term counts first, then TF-IDF re-weighting.
# NOTE(review): both objects are fitted on every caption, i.e. before any
# train/test split is made -- confirm that this is intended.
count_vect = CountVectorizer()
tfidf = TfidfTransformer()

caption_count = count_vect.fit_transform(caption)
tfidf_caption = tfidf.fit_transform(caption_count)

In [8]:
# Vocabulary terms, used below as row labels for the first document's scores.
# `get_feature_names()` no longer exists in newer scikit-learn releases
# (replaced by `get_feature_names_out()`), so use whichever one this
# installation provides.
if hasattr(count_vect, 'get_feature_names_out'):
    feature_names = list(count_vect.get_feature_names_out())
else:
    feature_names = list(count_vect.get_feature_names())

# TF-IDF row of the first caption.
first_document_vector = tfidf_caption[0]

# One row per vocabulary term; `toarray()` gives a plain ndarray rather than
# the legacy matrix type returned by `todense()`.
df = pd.DataFrame(first_document_vector.T.toarray(), index=feature_names, columns=["tfidf"])

# Highest-weighted terms first; show the top 10.
df = df.sort_values(by=["tfidf"], ascending=False)
df.head(10)

Unnamed: 0,tfidf
editz,0.386787
instatop,0.381003
actorslife,0.316553
public,0.313772
attitude,0.288668
bollywood,0.272849
pose,0.26266
models,0.244513
top,0.241793
hairstyle,0.24063


In [9]:
# Preview only: the term -> column-index mapping is very large, and showing
# all of it floods the notebook. Display the first 50 entries instead.
dict(list(count_vect.vocabulary_.items())[:50])

{'viral': 92833,
 'top': 88450,
 'instatop': 40107,
 'public': 71506,
 'photography': 68526,
 'editz': 22718,
 'pose': 70262,
 'models': 58106,
 'look': 51779,
 'attitude': 4244,
 'style': 83283,
 'bollywood': 10679,
 'actorslife': 427,
 'hairstyle': 33145,
 'fashion': 25515,
 'fashionista': 25675,
 'streetstyle': 83078,
 'streetwearfashion': 83103,
 'streetwear': 83097,
 'hype': 36837,
 'hypebeast': 36842,
 'highfashion': 34901,
 'offwhite': 63523,
 'supreme': 84308,
 'bape': 6525,
 'balenciaga': 5845,
 'louisvuitton': 51967,
 'gucci': 32574,
 'yeezy': 96706,
 'lit': 51151,
 'fire': 26629,
 'drip': 21777,
 'trending': 89802,
 'trend': 89798,
 'trendy': 89822,
 'trendsetter': 89820,
 'fashionblogger': 25559,
 'streetstylefashion': 83080,
 'culture': 18345,
 'sneakers': 80769,
 'insta': 39304,
 'instagram': 39618,
 'travelgram': 89254,
 'travel': 89059,
 'instahub': 39735,
 'instacool': 39432,
 'instagramhub': 39650,
 'nature': 61021,
 'landscape': 49141,
 'landscapephotography': 49154,

In [10]:
# Persist the fitted CountVectorizer to disk.
joblib.dump(value=count_vect, filename='count_vec.pkl')

# Persist the fitted TfidfTransformer to disk (the cell output is this call's return value).
joblib.dump(value=tfidf, filename='tfidf.pkl')

['tfidf.pkl']

In [12]:
# Targets and features for the classifier.
labels = train['labelling']
text = tfidf_caption

# Hold out 30% of the rows for testing; stratify on the label so each category
# keeps its proportion in both splits, and fix the seed for repeatability.
Train_X, Test_X, Train_Y, Test_Y = train_test_split(
    text,
    labels,
    test_size=0.3,
    stratify=labels,
    random_state=42,
)

# Support Vector Classifier with default settings.
svc_rbf = SVC()
svc_train_rbf = svc_rbf.fit(Train_X, Train_Y)

# Note: this accuracy is measured on the *training* split, expressed in percent.
score_svc_rbf = 100 * svc_train_rbf.score(Train_X, Train_Y)
print("Prediction score of Support Vector Classifier is {:.2f} %".format(score_svc_rbf))

Prediction score of Support Vector Classifier is 92.66 %


### Prediction on the test data

In [13]:
# Save the trained SVM model to disk.
joblib.dump(value=svc_rbf, filename='model_svc.pkl')

['model_svc.pkl']

In [14]:
# Predict a category for every row of the held-out test split.
svc_rbf_predict = svc_rbf.predict(Test_X)

### Evaluation

The F1-score is a useful benchmark for evaluating classification results when the data are class-imbalanced, i.e. when the classes are not represented in equal proportions. The per-class precision, recall and F1-score are shown in the classification report below.

In [16]:
from sklearn.metrics import accuracy_score, classification_report

# Store the result under its own name: assigning it to `accuracy_score` would
# replace the imported function with a float for the rest of the session.
test_accuracy = accuracy_score(Test_Y, svc_rbf_predict) * 100

# Per-class precision / recall / F1 on the test split, then overall accuracy in percent.
print(classification_report(Test_Y, svc_rbf_predict))
print("The result of Accuracy score of Data Testing is {:.2f} %".format(test_accuracy))

                    precision    recall  f1-score   support

           Fashion       0.83      0.82      0.83      3792
   Food & Beverage       0.92      0.87      0.90      2501
   Health & Beauty       0.89      0.95      0.92      6845
Lifestyle & Travel       0.90      0.87      0.89      6298
        Technology       0.92      0.81      0.86       416

          accuracy                           0.89     19852
         macro avg       0.90      0.87      0.88     19852
      weighted avg       0.89      0.89      0.89     19852

The result of Accuracy score of Data Testing is 88.95 %
