# Preparing the dataset:
# Splitting the dataset into a training set and a set with new data

In [1]:
import pandas as pd
import numpy as np

In [2]:
# Load the labelled comments; the file uses ';' as its field separator.
post_comments_labelled = pd.read_csv('post_comments_labelled.csv', delimiter=';')

In [3]:
post_comments_labelled

Unnamed: 0,id,url,comment_author,contents,time_posted,user_name_post_author,Sentiment
0,17.0,https://www.instagram.com/p/B70BFS2gx14/,follstore_idn,LikesViewsFollowersNya!,2020-01-27 07:18:28+01,instagram,Neutral
1,18.0,https://www.instagram.com/p/B70BFS2gx14/,bidobidorised_,❤️❤️,2020-01-27 07:18:28+01,instagram,Positive
2,19.0,https://www.instagram.com/p/B70BFS2gx14/,hadirezaeimusic,💖❤,2020-01-27 07:18:28+01,instagram,Positive
3,20.0,https://www.instagram.com/p/B70BFS2gx14/,sonurai5762,❤️❤️🔥❤️🔥,2020-01-27 07:18:28+01,instagram,Positive
4,21.0,https://www.instagram.com/p/B70BFS2gx14/,volkancatikkas,👽✌🏻,2020-01-27 07:18:28+01,instagram,Neutral
...,...,...,...,...,...,...,...
23825,,,,I don’t think size has much to do with feminin...,,,Negative
23826,,,,No. No. And no. Only people with a sick mind c...,,,Negative
23827,,,,"I really support her confidence,but obesity is...",,,Negative
23828,,,,Obesity is dangerous. How can you support that!?,,,Negative


In [4]:
post_comments_labelled.count()

id                       23819
url                      23819
comment_author           23819
contents                 23830
time_posted              23820
user_name_post_author    23819
Sentiment                  424
dtype: int64

In [5]:
# Remove every row whose comment text is exactly 'Bestätigt'.
bestaetigt_labels = post_comments_labelled.index[post_comments_labelled['contents'] == 'Bestätigt']
post_comments_labelled = post_comments_labelled.drop(index=bestaetigt_labels)

In [6]:
# Write the cleaned frame back over the source file. The file is loaded with
# delimiter=';' at the top of the notebook, so it has to be written with the
# same separator; with the default ',' the load cell could no longer parse the
# columns after a kernel restart.
post_comments_labelled.to_csv('post_comments_labelled.csv', sep=';', index=False)

In [7]:
# Pull out the labelled rows one sentiment class at a time (Positive, Negative,
# Neutral, in that order) and stack them into the training frame.
post_comments_1, post_comments_2, post_comments_3 = (
    post_comments_labelled.loc[post_comments_labelled['Sentiment'] == label]
    for label in ('Positive', 'Negative', 'Neutral')
)
frames = [post_comments_1, post_comments_2, post_comments_3]
post_comments_train = pd.concat(frames)

In [12]:
post_comments_train.head()

Unnamed: 0,id,url,comment_author,contents,time_posted,user_name_post_author,Sentiment
1,18.0,https://www.instagram.com/p/B70BFS2gx14/,bidobidorised_,❤️❤️,2020-01-27 07:18:28+01,instagram,Positive
2,19.0,https://www.instagram.com/p/B70BFS2gx14/,hadirezaeimusic,💖❤,2020-01-27 07:18:28+01,instagram,Positive
3,20.0,https://www.instagram.com/p/B70BFS2gx14/,sonurai5762,❤️❤️🔥❤️🔥,2020-01-27 07:18:28+01,instagram,Positive
5,22.0,https://www.instagram.com/p/B70BFS2gx14/,md_qizar_01,👌👌👌,2020-01-27 07:18:28+01,instagram,Positive
6,23.0,https://www.instagram.com/p/B70BFS2gx14/,md_qizar_01,🔥🔥🔥,2020-01-27 07:18:28+01,instagram,Positive


In [13]:
post_comments_train.count()

id                       413
url                      413
comment_author           413
contents                 424
time_posted              414
user_name_post_author    413
Sentiment                424
dtype: int64

Shuffling the rows in the train dataframe:

In [14]:
# Shuffle all rows (frac=1) and renumber the index from 0. A fixed random_state
# keeps the row order, and therefore the saved training CSV, the same across
# kernel restarts.
post_comments_train_shuffled = post_comments_train.sample(frac=1, random_state=42).reset_index(drop=True)

turning emojis into their written names:

In [15]:
import emoji

In [16]:
# Replace each emoji in the comment text with its textual name via emoji.demojize.
train_contents_as_text = post_comments_train_shuffled['contents'].map(emoji.demojize)
post_comments_train_shuffled['contents'] = train_contents_as_text

In [17]:
post_comments_train_shuffled.head()

Unnamed: 0,id,url,comment_author,contents,time_posted,user_name_post_author,Sentiment
0,318.0,https://www.instagram.com/p/B7ZC-4wFVVH/,ridwnsomawijaya,:fire:,2020-01-16 19:55:34+01,instagram,Positive
1,176.0,https://www.instagram.com/p/B7UBbVHFxH9/,trulytingz,I LOVE YOU,2020-01-14 21:05:57+01,arianagrande,Positive
2,300.0,https://www.instagram.com/p/B7bd1APhTU4/,chien8112,@siaoyisin @jbm_rich 學起來:rolling_on_the_floor_...,2020-01-17 18:28:38+01,instagram,Neutral
3,1280.0,https://www.instagram.com/p/B6Gqwv3AkmM/,official.0341,:face_with_open_mouth::face_with_open_mouth:,2019-12-15T19:06:12.000Z,instagram,Neutral
4,11149.0,https://www.instagram.com/p/B3nEAzdgJVM/,amirhossein_mie,Me models irani,2019-10-14T19:28:22.000Z,nike,Neutral


saving the training dataframe into a csv file:

In [23]:
# Write the shuffled, demojized training rows to disk without the index column.
post_comments_train_shuffled.to_csv('post_comments_train.csv', index=False)

preparing the new unlabelled data:

In [24]:
# Rows without a Sentiment value form the pool of new, unlabelled data.
unlabelled_mask = post_comments_labelled['Sentiment'].isna()
post_comments_newdata = post_comments_labelled.loc[unlabelled_mask]

Shuffling the newdata dataframe:

In [25]:
# Shuffle all unlabelled rows (frac=1) and renumber the index from 0. A fixed
# random_state keeps the row order, and therefore the saved new-data and test
# CSVs, the same across kernel restarts.
post_comments_newdata_shuffled = post_comments_newdata.sample(frac=1, random_state=42).reset_index(drop=True)

In [26]:
# Replace each emoji in the comment text with its textual name via emoji.demojize.
newdata_contents_as_text = post_comments_newdata_shuffled['contents'].map(emoji.demojize)
post_comments_newdata_shuffled['contents'] = newdata_contents_as_text

In [28]:
from langdetect import DetectorFactory

In [30]:
from langdetect import detect

In [36]:
from langdetect.lang_detect_exception import LangDetectException

In [29]:
# Fix langdetect's seed so that detect() returns the same result on every run
# (langdetect documents its algorithm as non-deterministic without this).
DetectorFactory.seed = 0

In [37]:
def _is_non_english(text):
    """Return True when langdetect classifies `text` as a language other than English.

    If langdetect raises LangDetectException for the text, the function
    returns False so the row is kept, matching the original best-effort
    handling of that exception.
    """
    try:
        return detect(text) != 'en'
    except LangDetectException:
        return False


# DataFrame.drop returns a new frame, so the previous loop discarded every
# result and removed nothing. Build a boolean mask once and reassign instead.
non_english_mask = (
    post_comments_newdata_shuffled['contents'].apply(_is_non_english).astype(bool)
)
# Renumber the index after filtering so the label-based `.loc[:100]` slice
# further down still selects the first rows instead of skipping removed labels.
post_comments_newdata_shuffled = (
    post_comments_newdata_shuffled[~non_english_mask].reset_index(drop=True)
)

and then saving the dataframe with new data into a csv file:

In [40]:
# Write the shuffled, demojized unlabelled rows to disk without the index column.
post_comments_newdata_shuffled.to_csv('post_comments_newdata.csv', index=False)

preparing a first batch of test data from the new data:

In [41]:
# Label-based .loc slicing includes the end label, so this takes the rows
# labelled 0 through 100 (101 rows with the 0..n-1 index from reset_index).
# NOTE(review): confirm whether 100 or 101 test rows were intended.
post_comments_test = post_comments_newdata_shuffled.loc[:100]

In [42]:
# Write the first test batch to disk without the index column.
post_comments_test.to_csv('post_comments_test_NB.csv', index=False)