# Detecting Sarcasm in Reddit Comments – EDA and Data Pre-processing

**Team 4:** Nanda H Krishna, Rubini U and Vikram Reddy

**Checklist:**
1. [x] EDA and Pre-processing
2. [ ] TF-IDF (Random Forest, Gradient Boosting, Gaussian Naïve Bayes, Multi-Layer Perceptron, Neural Network, Linear SVM)
    - [ ] TF-IDF on Pre-processed Text
    - [ ] TF-IDF on Raw Text
    - [ ] Effect of using 2-grams
    - [ ] Effect of using PCA
    - [ ] Ensembling models
    - [ ] Model Interpretability
3. [ ] BERT Embeddings

## Importing Modules

In [None]:
import matplotlib.pyplot as plt
%matplotlib inline
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')
from nltk.stem import WordNetLemmatizer
nltk.download('wordnet')
import numpy as np
import pandas as pd
import string
from wordcloud import WordCloud

## Loading Dataset

In [None]:
# Raw sarcasm dataset, addressed relative to the notebook's working directory.
DATASET_PATH = 'sarcasm/sarcasm.csv'
df = pd.read_csv(DATASET_PATH)

In [None]:
# Preview the first rows of the freshly loaded frame.
df.head()

In [None]:
# Summarise the frame: columns, non-null counts and dtypes.
df.info()

## Data Visualisation and Pre-processing

### Wordcloud

In [None]:
# NLTK's English stop-word list, held as a set so membership tests are cheap.
english_stopwords = stopwords.words('english')
stop_words = set(english_stopwords)

In [None]:
# Word cloud over every raw comment, with NLTK stop words excluded.
all_comments = ' '.join(df['comment'].astype(str))
cloud = WordCloud(background_color='white', stopwords=stop_words)
cloud.generate(all_comments)

fig, ax = plt.subplots()
ax.imshow(cloud, interpolation='bilinear')
ax.axis('off')
plt.show()

In [None]:
# Word cloud over every raw parent comment, with NLTK stop words excluded.
all_parents = ' '.join(df['parent_comment'].astype(str))
cloud = WordCloud(background_color='white', stopwords=stop_words)
cloud.generate(all_parents)

fig, ax = plt.subplots()
ax.imshow(cloud, interpolation='bilinear')
ax.axis('off')
plt.show()

### Some Statistics

In [None]:
# Words per raw comment.
# str.split() with no separator splits on any run of whitespace, so repeated
# spaces, tabs and newlines no longer inflate the count, and an empty comment
# counts as 0 words (split(' ') reports 1). Missing comments are treated as
# empty instead of being counted as the literal word "nan".
comment_lengths = df['comment'].fillna('').astype(str).str.split().str.len()
print('Max:', comment_lengths.max(), 'Min:', comment_lengths.min(),
      'Mean:', int(comment_lengths.mean()), "Median:", int(comment_lengths.median()))
comment_lengths.plot(kind='box');

In [None]:
# Words per raw parent comment.
# str.split() with no separator splits on any run of whitespace, so repeated
# spaces, tabs and newlines no longer inflate the count, and an empty comment
# counts as 0 words (split(' ') reports 1). Missing parents are treated as
# empty instead of being counted as the literal word "nan".
parent_lengths = df['parent_comment'].fillna('').astype(str).str.split().str.len()
print('Max:', parent_lengths.max(), 'Min:', parent_lengths.min(),
      'Mean:', int(parent_lengths.mean()), "Median:", int(parent_lengths.median()))
parent_lengths.plot(kind='box');

In [None]:
# Word cloud restricted to raw comments whose label is 1.
is_label_one = df['label'] == 1
labelled_comments = ' '.join(df.loc[is_label_one, 'comment'].astype(str))
cloud = WordCloud(background_color='white', stopwords=stop_words)
cloud.generate(labelled_comments)

fig, ax = plt.subplots()
ax.imshow(cloud, interpolation='bilinear')
ax.axis('off')
plt.show()

### Cleaning Text

In [None]:
# Translation table that deletes every ASCII punctuation character; built once
# instead of on every call.
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)


def clean_text(text, stopword_set=None):
    """Remove ASCII punctuation and stop words from ``text``.

    The text is split on whitespace. Each token has its punctuation deleted and
    is dropped when it is a stop word (compared case-insensitively). The
    surviving tokens keep their original casing and are joined with single
    spaces.

    Stop-word lists such as NLTK's contain contractions ("don't", "isn't").
    Deleting punctuation first turns those into "dont"/"isnt", which no longer
    match the list, so each token is also checked in its original form with
    only leading/trailing punctuation trimmed.

    Args:
        text: The string to clean.
        stopword_set: Optional collection of lower-case stop words. Defaults to
            the notebook-level ``stop_words`` set.

    Returns:
        The cleaned string; empty if nothing survives.
    """
    if stopword_set is None:
        stopword_set = stop_words

    kept = []
    for token in text.split():
        bare = token.translate(_PUNCTUATION_TABLE)
        if not bare:
            # Token consisted solely of punctuation.
            continue
        if bare.lower() in stopword_set:
            continue
        # Catch contractions whose apostrophe is part of the stop word itself.
        if token.strip(string.punctuation).lower() in stopword_set:
            continue
        kept.append(bare)
    return ' '.join(kept)

In [None]:
# Clean every comment. Missing comments are filled with '' first; otherwise
# astype(str) would turn them into the literal token "nan" and that fake word
# would end up in the processed text.
df['processed_comment'] = df['comment'].fillna('').astype(str).apply(clean_text)

In [None]:
# Word cloud over the cleaned comments.
cleaned_comments = ' '.join(df['processed_comment'].astype(str))
cloud = WordCloud(background_color='white', stopwords=stop_words)
cloud.generate(cleaned_comments)

fig, ax = plt.subplots()
ax.imshow(cloud, interpolation='bilinear')
ax.axis('off')
plt.show()

In [None]:
# Words per cleaned comment.
# str.split() with no separator returns [] for an empty string, so comments
# that are empty after cleaning count as 0 words; split(' ') reported them as
# 1 word, which made the minimum and the mean misleading.
processed_lengths = df['processed_comment'].astype(str).str.split().str.len()
print('Max:', processed_lengths.max(), 'Min:', processed_lengths.min(),
      'Mean:', int(processed_lengths.mean()), "Median:", int(processed_lengths.median()))
processed_lengths.plot(kind='box');

In [None]:
# Clean every parent comment. Missing values are filled with '' first;
# otherwise astype(str) would turn them into the literal token "nan" and that
# fake word would end up in the processed text.
df['processed_parent'] = df['parent_comment'].fillna('').astype(str).apply(clean_text)

In [None]:
# Word cloud over the cleaned parent comments.
cleaned_parents = ' '.join(df['processed_parent'].astype(str))
cloud = WordCloud(background_color='white', stopwords=stop_words)
cloud.generate(cleaned_parents)

fig, ax = plt.subplots()
ax.imshow(cloud, interpolation='bilinear')
ax.axis('off')
plt.show()

In [None]:
# Words per cleaned parent comment.
# str.split() with no separator returns [] for an empty string, so parents
# that are empty after cleaning count as 0 words; split(' ') reported them as
# 1 word, which made the minimum and the mean misleading.
processed_parent_lengths = df['processed_parent'].astype(str).str.split().str.len()
print('Max:', processed_parent_lengths.max(), 'Min:', processed_parent_lengths.min(),
      'Mean:', int(processed_parent_lengths.mean()), "Median:", int(processed_parent_lengths.median()))
processed_parent_lengths.plot(kind='box');

In [None]:
# Word cloud restricted to cleaned comments whose label is 1.
is_label_one = df['label'] == 1
labelled_cleaned = ' '.join(df.loc[is_label_one, 'processed_comment'].astype(str))
cloud = WordCloud(background_color='white', stopwords=stop_words)
cloud.generate(labelled_cleaned)

fig, ax = plt.subplots()
ax.imshow(cloud, interpolation='bilinear')
ax.axis('off')
plt.show()

### Lemmatising Text

In [None]:
# Shared WordNet lemmatiser instance; lemmatise_text looks it up by the name `l`.
l = WordNetLemmatizer()

In [None]:
def lemmatise_text(text):
    """Lemmatise every whitespace-separated token of ``text``.

    Each token is passed to the notebook-level lemmatiser ``l`` and the
    results are re-joined with single spaces.
    """
    lemmas = map(l.lemmatize, text.split())
    return ' '.join(lemmas)

In [None]:
# Lemmatise the cleaned comments element-wise into a new column.
cleaned_comment_text = df['processed_comment'].astype(str)
df['lemmatised_comment'] = cleaned_comment_text.map(lemmatise_text)

In [None]:
# Word cloud over the lemmatised comments.
lemmatised_comments = ' '.join(df['lemmatised_comment'].astype(str))
cloud = WordCloud(background_color='white', stopwords=stop_words)
cloud.generate(lemmatised_comments)

fig, ax = plt.subplots()
ax.imshow(cloud, interpolation='bilinear')
ax.axis('off')
plt.show()

In [None]:
# Lemmatise the cleaned parent comments element-wise into a new column.
cleaned_parent_text = df['processed_parent'].astype(str)
df['lemmatised_parent'] = cleaned_parent_text.map(lemmatise_text)

In [None]:
# Word cloud over the lemmatised parent comments.
lemmatised_parents = ' '.join(df['lemmatised_parent'].astype(str))
cloud = WordCloud(background_color='white', stopwords=stop_words)
cloud.generate(lemmatised_parents)

fig, ax = plt.subplots()
ax.imshow(cloud, interpolation='bilinear')
ax.axis('off')
plt.show()

In [None]:
# Word cloud restricted to lemmatised comments whose label is 1.
is_label_one = df['label'] == 1
labelled_lemmatised = ' '.join(df.loc[is_label_one, 'lemmatised_comment'].astype(str))
cloud = WordCloud(background_color='white', stopwords=stop_words)
cloud.generate(labelled_lemmatised)

fig, ax = plt.subplots()
ax.imshow(cloud, interpolation='bilinear')
ax.axis('off')
plt.show()

## Saving Dataset

In [None]:
# Persist the frame, including the processed and lemmatised columns, without
# writing the row index.
OUTPUT_PATH = 'sarcasm/dataset.csv'
df.to_csv(OUTPUT_PATH, index=False)

In [None]:
# Preview the first rows of the frame that was just written to disk.
df.head()