In [None]:
import re

import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from textblob import TextBlob

### We start by loading our first dataset, which contains the following information:

1. title: Title of the reddit post
2. created: Time at which the post was created
3. author : Name of the post's author
4. No. of comments : Number of comments on the post
5. url : URL of the reddit post
6. body : Body of the post if any
7. score: The number of upvotes minus the number of downvotes on the post
8. id: ID of the post

In [None]:
# Read the Reddit posts dataset (columns described above) from a CSV file in the
# working directory.
data = pd.read_csv("data3.csv")

### From a quick look at the raw data, we can make the following assumptions:

1. The body column contains mostly nan values. 
2. The id column holds no specific significance, either for classifying the post or for extracting any meaningful insights from it.
3. The created column contains only the time at which the post was created, which might not give any significant insights into the data. 

It is hence best to drop out these columns before we perform exploratory data analysis.

In [None]:
## Removing unwanted columns

# Rebind instead of mutating in place, and ignore columns that are already gone,
# so that re-running this cell does not raise a KeyError.
data = data.drop(columns=['body', 'id', 'created'], errors='ignore')


## Feature Engineering

In this section I am adding certain additional features to my dataset, so as to study the text data in the title column better

1. title_len = Length of the title
2. word_count = Total no. of words in the title
3. polarity = Sentiment polarity of the given title on a scale of [-1 to 1] where -1 signifies most -ve and +1 most +ve

In [None]:
## adding extra features

# Every feature casts the title to str first: TextBlob only accepts strings, so a
# missing (NaN) title would otherwise crash the polarity computation even though
# the two length features tolerate it.
data['title_len'] = data['title'].astype(str).apply(len)
data['word_count'] = data['title'].apply(lambda x: len(str(x).split()))
data["polarity"] = data["title"].map(lambda text: TextBlob(str(text)).sentiment.polarity)
# Show only the first rows rather than the whole frame.
data.head()

### Univariate Visualization

This step is performed in order to get the summary statistics for each field in the data set:
1. Sentiment Polarity Distribution
2. Title Word Count Distribution
3. Title Length Distribution
4. Mean scores for different flairs
5. Total number of comments in different flairs

It helps us to understand the dataset better, and gives us insight on what could be our approach towards cleaning of data

In [None]:
import matplotlib.pyplot as plt

### Sentiment Polarity Distribution in Title text

In [None]:
# Histogram of title sentiment polarity, drawn on an explicit Axes.
fig, ax = plt.subplots()
ax.hist(data["polarity"], bins=10, edgecolor='black', color='purple', alpha=0.5)
ax.set_title('Sentiment Polarity Distribution')
plt.show()

From the above histogram, it can be seen that most of the text data in title column is of neutral sentiment.

In [None]:
print('5 random posts with the highest positive sentiment polarity: \n')
most_positive = data.loc[data.polarity == 1, 'title']
# Cap the sample size: DataFrame/Series.sample raises ValueError when asked for
# more rows than exist, which happens if fewer than five titles score exactly 1.
for title in most_positive.sample(min(5, len(most_positive))):
    print(title)

In [None]:
print('5 random posts with the highest neutral sentiment polarity: \n')
neutral_titles = data.loc[data.polarity == 0, 'title']
# Cap the sample size so the cell cannot raise ValueError when fewer than five
# titles have a polarity of exactly 0.
for title in neutral_titles.sample(min(5, len(neutral_titles))):
    print(title)

In [None]:
print('5 random posts with the highest negative sentiment polarity: \n')
most_negative = data.loc[data.polarity == -1, 'title']
# Cap the sample size so the cell cannot raise ValueError when fewer than five
# titles have a polarity of exactly -1.
for title in most_negative.sample(min(5, len(most_negative))):
    print(title)

### Word count distribution of title text

In [None]:
# Histogram of the number of words per title, drawn on an explicit Axes.
fig, ax = plt.subplots()
ax.hist(data["word_count"], bins=10, edgecolor='black', color='blue', alpha=0.5)
ax.set_title('Word Count Distribution')
plt.show()

### Title Length Distribution

In [None]:
# Histogram of the number of characters per title, drawn on an explicit Axes.
fig, ax = plt.subplots()
ax.hist(data["title_len"], bins=10, edgecolor='black', color='red', alpha=0.5)
ax.set_title('Title Length Distribution')
plt.show()

The title length distribution shows that most titles are relatively short in terms of the number of characters.

### Mean Scores of Different Flairs:

A submission's score in reddit is simply the number of upvotes minus the number of downvotes. By studying the mean scores for different flairs, we get to see how popular different reddit flairs are among the users. It might tell us if we can also use scores as a distinguishing feature in flairs

In [None]:
## Finding the mean scores

# Average the raw `score` column within each flair. Taking `.mean()` of
# `describe()["score"]` would instead average the summary rows themselves
# (count, mean, std, min, quartiles, max), which is not the mean score.
# sort=False keeps the flairs in order of first appearance.
scores = data.groupby("flair", sort=False)["score"].mean().to_dict()
scores

In [None]:
## Plotting the mean scores

# One bar per flair, in the insertion order of the `scores` dict.
flair_names = list(scores.keys())
mean_values = list(scores.values())

fig, ax = plt.subplots()
ax.bar(x=flair_names, height=mean_values, color='magenta', edgecolor='black', alpha=0.5)
ax.set_xlabel("Flair")
ax.set_ylabel("Mean Score")
ax.set_title("Mean score of different flairs")
ax.grid()
# Rotate the flair names so they do not overlap.
plt.xticks(rotation=80)
plt.show()

### Number of comments in different flairs


In [None]:
# Total number of comments per flair; aggregate once and split the result into
# the label list and the value list used by the pie chart.
comments_per_flair = data.groupby(['flair'])['No. of comments'].sum()
flairs = list(comments_per_flair.index)
sum_of_no_of_comments = list(comments_per_flair.values)

In [None]:
# Build one explode offset per wedge. A hard-coded list of eleven offsets makes
# plt.pie raise ValueError whenever the data holds a different number of flairs.
explode_offsets = [0.15] * len(sum_of_no_of_comments)
plt.pie(sum_of_no_of_comments, labels=flairs, explode=explode_offsets, autopct="%.1f%%")
plt.title("Total Number of comments for different flairs\n")
plt.show()

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

## Frequently occurring words in title text

In [None]:
def get_top_n_words(corpus, n=None):
    """Return the most frequent tokens in `corpus` as (word, count) pairs.

    The corpus is tokenised with a default CountVectorizer (no stop-word
    removal). Pairs are sorted by descending count and truncated to the
    first `n`; with `n=None` the full ranking is returned.
    """
    vectorizer = CountVectorizer().fit(corpus)
    # Column-wise totals: one summed count per vocabulary term.
    term_totals = vectorizer.transform(corpus).sum(axis=0)
    ranked = sorted(
        [(term, term_totals[0, column]) for term, column in vectorizer.vocabulary_.items()],
        key=lambda pair: pair[1],
        reverse=True,
    )
    return ranked[:n]
# Top 20 title tokens (stop words included), split into parallel label/value
# lists for plotting.
common_words = get_top_n_words(data['title'], 20)
df1 = pd.DataFrame(common_words, columns=['title', 'count'])
ranked_counts = df1.groupby('title').sum()['count'].sort_values(ascending=False)
words = list(ranked_counts.index)
count = list(ranked_counts.values)

### Before Removing stopwords

In [None]:
# Bar chart of the top title tokens before stop-word removal.
fig, ax = plt.subplots()
ax.bar(words, count, color='blue', edgecolor='black', alpha=0.5)
ax.set_title("Top 20 words in title text before removing stop words")
plt.xticks(rotation=80)
plt.show()

### After removing stopwords

In [None]:
def get_top_n_words_stop(corpus, n=None):
    """Return the most frequent non-stop-word tokens in `corpus`.

    Same ranking as a plain token count, but the CountVectorizer is built
    with `stop_words='english'`. The result is a list of (word, count)
    pairs sorted by descending count and truncated to the first `n`
    (`n=None` returns the full ranking).
    """
    vectorizer = CountVectorizer(stop_words='english').fit(corpus)
    # Column-wise totals: one summed count per vocabulary term.
    term_totals = vectorizer.transform(corpus).sum(axis=0)
    ranked = sorted(
        [(term, term_totals[0, column]) for term, column in vectorizer.vocabulary_.items()],
        key=lambda pair: pair[1],
        reverse=True,
    )
    return ranked[:n]
# Top 20 title tokens after English stop-word removal, split into parallel
# label/value lists for plotting.
common_words = get_top_n_words_stop(data['title'], 20)
df2 = pd.DataFrame(common_words, columns=['title', 'count'])
ranked_counts = df2.groupby('title').sum()['count'].sort_values(ascending=False)
words = list(ranked_counts.index)
count = list(ranked_counts.values)

In [None]:
# Bar chart of the top title tokens after stop-word removal.
fig, ax = plt.subplots()
ax.bar(words, count, color='red', edgecolor='black', alpha=0.5)
ax.set_title("Top 20 words in title text after removing stop words")
plt.xticks(rotation=80)
plt.show()

#### The above visualization stresses the importance of removing stopwords from our dataset, as they are present in all categories of text and hold little importance when it comes to classification.
#### After removing stopwords, we find that a few words occur very frequently, such as coronavirus, which is present in almost all of the categories. This may pose a problem for classification: because of the ongoing COVID-19 crisis, the content of many flairs is similar and revolves around coronavirus, which may confuse our classifier.


# Now exploring comments data

## EDA on comments data

Since the comments data is large and contains a lot of impurity in terms of symbols and characters, we need to preprocess the data a little so that we can get better insights in the actual content of data.

1. We first start by loading the data from a csv file to a pandas dataframe
2. Next we preprocess the comments data and remove the bad symbols
3. Since the body of comments includes various top comments, we separate sentences to form a new dataframe
4. Next we perform feature engineering on our data, i.e. adding additional features such as comment lengths, sentiment polarity and word count.
5. After this, we perform univariate visualization on the data 
6. A visualisation of the top words before and after removing stopwords is also performed on the given data.

In [None]:
# 1. Loading data
# Read the comments dataset from a CSV file in the working directory; later cells
# use its "flair" and "body" columns.
comment_data = pd.read_csv("data4.csv")
comment_data

In [None]:
# Inspect a single raw comment body (row label 99) to see its formatting.
comment_data['body'][99]

#### From a look at the raw data above, we can see that the comment data is highly impure: it contains a lot of bad symbols, poor formatting, and Hindi/Hinglish words, which may make it difficult both to perform EDA and, later, to classify.

## Preprocessing the data before EDA

In [None]:
## Removing bad symbols

def preprocess(comments):
    """Strip unwanted symbols and escape sequences from a Series of comments.

    Parameters
    ----------
    comments : pandas.Series
        Text values; missing entries are left untouched by ``.str.replace``.

    Returns
    -------
    pandas.Series
        The same Series with every listed token removed.
    """
    # Order matters: "//" and the literal two-character sequences "\n" and "\t"
    # must be removed before the lone backslash, otherwise a stray "n"/"t"
    # would be left behind.
    tokens_to_strip = (
        "//", "[", "=", "]", ")", "(",
        "\\n", "\\t", "\\",
        "@", "<", ">",
    )
    for token in tokens_to_strip:
        # regex=False makes every token a plain literal on all pandas versions.
        # The default changed in pandas 2.0, and several of these tokens
        # ("[", "(", ")", "\\") are not valid regular expressions on their own.
        comments = comments.str.replace(token, "", regex=False)

    return comments
# Overwrite the raw comment bodies with their cleaned version.
comment_data["body"] = preprocess(comment_data['body'])

In [None]:
#import re
#comment_data['body'] = comment_data['body'].apply(lambda comment : re.sub(r'[\xe2\x80\x99s]', '', str(comment)))

In [None]:
## breaking the data into separate sentences along with their respective flairs

In [None]:
from nltk.tokenize import sent_tokenize

In [None]:
# Pull the two columns out as NumPy arrays; position i in both refers to the
# same row of comment_data.
flairs = np.array(comment_data["flair"])
comment_text = np.array(comment_data["body"])

In [None]:
# Split every comment body into sentences, repeating the row's flair for each
# sentence and lower-casing the text.
f = []
c = []
for flair, text in zip(flairs, comment_text):
    # Skip missing bodies. str(NaN) would otherwise yield the literal sentence
    # "nan", which the later dropna() step cannot remove.
    if pd.isna(text):
        continue
    for sentence in sent_tokenize(str(text)):
        f.append(flair)
        c.append(sentence.lower())
f = np.array(f)
c = np.array(c)

In [None]:
# Sanity check: both arrays were appended to in lockstep, so the shapes should match.
c.shape, f.shape

In [None]:
# Pair each sentence with its flair as (flair, sentence) tuples and preview the
# first five.
d = [(f[i], c[i]) for i in range(len(f))]
d[:5]

In [None]:
# Rebind comment_data to the sentence-level frame: one row per (flair, sentence) pair.
comment_data = pd.DataFrame(d, columns = ["flair", "body"])

In [None]:
# Display the sentence-level frame.
comment_data

In [None]:
## Removing NaN values from the data

In [None]:
# Drop every row that has a missing value in any column.
comment_data = comment_data.dropna()

In [None]:
# Summary statistics of the cleaned sentence-level frame.
comment_data.describe()

In [None]:
# Pre-compiled patterns for text cleaning. Raw strings keep the backslashes
# intact without relying on Python passing invalid escape sequences such as
# "\[" through unchanged. Requires `import re` in the notebook's import cell.
# Punctuation that should be turned into a space.
REPLACE_BY_SPACE_RE = re.compile(r'[/(){}\[\]\|@,;]')
# Anything that is not a digit, lowercase letter, space, '#', '+' or '_'.
BAD_SYMBOLS_RE = re.compile(r'[^0-9a-z #+_]')

## Feature Engineering

In [None]:
## adding extra features
# Character length, word count and sentiment polarity of every sentence row.
body_text = comment_data['body']
comment_data['comments_len'] = body_text.astype(str).map(len)
comment_data['word_count'] = body_text.map(lambda sentence: len(str(sentence).split()))
comment_data["polarity"] = body_text.apply(lambda sentence: TextBlob(str(sentence)).sentiment.polarity)
comment_data

## Univariate Visualization

In [None]:
import matplotlib.pyplot as plt
# Histogram of sentence-level sentiment polarity, drawn on an explicit Axes.
fig, ax = plt.subplots()
ax.hist(comment_data["polarity"], bins=10, edgecolor='black', color='purple', alpha=0.5)
ax.set_title('Sentiment Polarity Distribution(Comments)')
plt.show()

In [None]:
# Summary statistics for the word_count feature.
comment_data['word_count'].describe()

In [None]:
# Histogram of words per sentence row, drawn on an explicit Axes.
fig, ax = plt.subplots()
ax.hist(comment_data["word_count"], bins=50, edgecolor='black', color='blue', alpha=0.5)
ax.set_title('Word Count Distribution (Comments)')
plt.show()

In [None]:
# Histogram of characters per sentence row, drawn on an explicit Axes.
fig, ax = plt.subplots()
ax.hist(comment_data["comments_len"], bins=50, edgecolor='black', color='red', alpha=0.5)
ax.set_title('Comment Length Distribution')
plt.show()

In [None]:
# Top 20 comment tokens (stop words included), split into parallel label/value
# lists for plotting.
common_words = get_top_n_words(comment_data['body'], 20)
df1 = pd.DataFrame(common_words, columns=['comment', 'count'])
ranked_counts = df1.groupby('comment').sum()['count'].sort_values(ascending=False)
words = list(ranked_counts.index)
count = list(ranked_counts.values)

In [None]:
# Bar chart of the top comment tokens before stop-word removal. The title now
# names the comment text, since `words`/`count` here come from the comment
# bodies rather than the post titles.
plt.bar(words, count, color = 'navy', edgecolor = 'black', alpha = 0.5)
plt.title("Top 20 words in comment text before removing stop words")
plt.xticks(rotation = 80)
plt.show()

In [None]:
# Top 20 comment tokens after English stop-word removal.
common_words = get_top_n_words_stop(comment_data['body'], 20)
df2 = pd.DataFrame(common_words, columns = ['comment' , 'count'])
# Rank once and reuse the result for both the labels and the bar heights.
ranked_counts = df2.groupby('comment').sum()['count'].sort_values(ascending=False)
words = list(ranked_counts.index)
count = list(ranked_counts.values)
plt.bar(words, count, color = 'violet', edgecolor = 'black', alpha = 0.5)
# The data plotted here is comment text, not title text.
plt.title("Top 20 words in comment text after removing stop words")
plt.xticks(rotation = 80)
plt.show()

In [None]:
print('5 random posts with the highest positive sentiment polarity: \n')
most_positive = comment_data.loc[comment_data.polarity == 1, 'body']
# Cap the sample size so the cell cannot raise ValueError when fewer than five
# rows have a polarity of exactly 1.
for body in most_positive.sample(min(5, len(most_positive))):
    print(body)

In [None]:
# This cell samples rows with polarity 0, so the heading says "neutral".
print('5 random posts with the highest neutral sentiment polarity: \n')
neutral_bodies = comment_data.loc[comment_data.polarity == 0, 'body']
# Cap the sample size so the cell cannot raise ValueError when fewer than five
# rows have a polarity of exactly 0.
for body in neutral_bodies.sample(min(5, len(neutral_bodies))):
    print(body)

In [None]:
# This cell samples rows with polarity -1, so the heading says "negative".
print('5 random posts with the highest negative sentiment polarity: \n')
most_negative = comment_data.loc[comment_data.polarity == -1, 'body']
# Cap the sample size so the cell cannot raise ValueError when fewer than five
# rows have a polarity of exactly -1.
for body in most_negative.sample(min(5, len(most_negative))):
    print(body)

#### One of the biggest issues with the comments data is that comments in r/India posts are mostly in Hindi, Hinglish or poor English.
#### There are also a lot of emojis, abusive words, etc., which the classifier may not understand.