# PhD odyssey sentiment analysis 

## Expected directory structure


* Dataset <- Store all the input files here in .txt format (one file per author)

## In this Notebook:

* Data pre-processing

      1. Convert the Word and PDF files to .txt
      2. Split the text into sentences, each labelled with the author name
      3. Remove special characters
      4. POS tag
      5. Lemmatization


* PhD odyssey sentiment analysis using  

      1. Vader
      2. TextBlob
      3. Flair (Flair pretrained sentiment analysis model is trained on IMDB dataset)
      4. SentiWordNet (also tried, but it does not seem to work well on this data)
      
* Output 

      1. Positive sentence
      2. Negative sentence
      3. Neutral sentence
      


## Libraries

In [2]:
import pandas as pd
from os import listdir
from os.path import isfile, join
import re
import nltk
# Tokenizer models used by word_tokenize. 'punkt_tab' is the name newer NLTK
# releases look for; requesting an id the installed index does not know is
# expected to only print a message (assumption - confirm for your NLTK version).
nltk.download('punkt')
nltk.download('punkt_tab')
from nltk.tokenize import word_tokenize
# pos_tag needs a tagger model; without it the first pos_tag call fails with a
# LookupError. Both the classic and the newer ('_eng') resource ids are requested.
nltk.download('averaged_perceptron_tagger')
nltk.download('averaged_perceptron_tagger_eng')
from nltk import pos_tag
nltk.download('stopwords')
from nltk.corpus import stopwords
nltk.download('wordnet')
from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer
# Shared lemmatizer instance used by lemmatize() and sentiwordnetanalysis().
wordnet_lemmatizer = WordNetLemmatizer()
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from textblob import TextBlob
nltk.download('sentiwordnet')
from nltk.corpus import sentiwordnet as swn
from flair.models import TextClassifier
from flair.data import Sentence

ModuleNotFoundError: No module named 'pandas'

## Create DF

In [None]:
def create_df(folder_path):
    """Collect [sentence, author] rows from every regular file in a folder.

    Each file is read as UTF-8 (undecodable bytes are ignored) and split on
    ".". Line breaks inside a fragment are flattened, and a fragment is kept
    only when splitting it on single spaces yields more than two pieces.
    The author label is the file name with every ".txt" occurrence removed.

    Args:
        folder_path: Directory to scan (sub-directories are skipped).

    Returns:
        A list of two-item lists ``[fragment_text, author_name]``.
    """
    rows = []

    for name in listdir(folder_path):
        path = join(folder_path, name)
        if not isfile(path):
            continue

        with open(path, encoding="utf8", errors='ignore') as handle:
            content = handle.read()

        author = name.replace('.txt', '')
        for fragment in content.split("."):
            flattened = fragment.replace('\n', ' ').replace('\r', '')
            # Very short fragments (two pieces or fewer) are discarded.
            if len(flattened.split(" ")) > 2:
                rows.append([flattened, author])

    return rows

In [None]:
# Folder that holds the input text files; each file name (minus ".txt")
# becomes the author label in create_df().
folder_path = 'Dataset'

In [None]:
# Build the raw [sentence, author] rows from every file in the folder.
pd_list = create_df(folder_path)

In [None]:
# Number of sentence rows that were extracted.
len(pd_list)

In [None]:
# Tabular view of the rows: one sentence per row with its author.
df = pd.DataFrame(pd_list, columns = ['Sentence', 'Author_name'])

In [None]:
# Sentence count per author.
df.Author_name.value_counts()

In [None]:
# Persist the raw sentence table before any preprocessing is applied.
df.to_csv('phd_odyssey_sentiment_analysis.csv')

## Data preprocessing

Remove all special characters and digits, leaving only the letters A–Z and a–z

In [None]:
def clean(text):
    """Replace every run of non-letter characters with a single space.

    Only the ASCII letters A-Z and a-z survive; digits, punctuation,
    whitespace (including line breaks) and accented or other non-ASCII
    characters are all collapsed into one space per run.

    Args:
        text: The raw sentence string.

    Returns:
        The cleaned string. Leading/trailing spaces are not stripped.
    """
    # The substitution already removes '\r' and '\n', so no separate
    # replace() pass is needed afterwards.
    return re.sub(r'[^A-Za-z]+', ' ', text)

POS tagger dictionary

In [None]:
# Maps the first letter of a tag returned by pos_tag to the matching WordNet
# part-of-speech constant; any other first letter has no entry.
pos_dict = {'J':wordnet.ADJ, 'V':wordnet.VERB, 'N':wordnet.NOUN, 'R':wordnet.ADV}

def token_stop_pos(text):
    """Tokenize, POS-tag and stopword-filter a sentence.

    Args:
        text: The cleaned sentence string.

    Returns:
        A list of ``(word, wordnet_pos)`` tuples for every token whose
        lower-cased form is not an English stopword. ``wordnet_pos`` is
        ``None`` when the tag's first letter is not a key of ``pos_dict``.
    """
    # Build the stopword set once per call instead of once per token.
    stop_words = set(stopwords.words('english'))
    return [
        (word, pos_dict.get(tag[0]))
        for word, tag in pos_tag(word_tokenize(text))
        if word.lower() not in stop_words
    ]

Obtaining the stem words – Lemmatization

In [None]:
def lemmatize(pos_data):
    """Join the lemmas of a POS-tagged sentence into one string.

    Args:
        pos_data: Iterable of ``(word, pos)`` pairs as produced by
            ``token_stop_pos``. A falsy ``pos`` means the word is kept as is;
            otherwise it is passed to ``wordnet_lemmatizer.lemmatize``.

    Returns:
        A string that starts with one space, followed by each lemma preceded
        by a space (so the text begins with two spaces, and an empty input
        yields ``" "``). This matches the format the notebook has always
        stored in the ``Lemma`` column.
    """
    lemmas = [
        wordnet_lemmatizer.lemmatize(word, pos=pos) if pos else word
        for word, pos in pos_data
    ]
    # One join instead of repeated string concatenation inside the loop.
    return " " + "".join(" " + lemma for lemma in lemmas)

In [None]:
# Strip everything except ASCII letters from each raw sentence.
df['Sentence_clean'] = df['Sentence'].apply(clean)

In [None]:
# Tokenize, drop English stopwords and attach a WordNet POS to each word.
df['POS_tagged'] = df['Sentence_clean'].apply(token_stop_pos)

In [None]:
# Turn each list of (word, pos) pairs back into one lemmatized string.
df['Lemma'] = df['POS_tagged'].apply(lemmatize)

In [None]:
# Keep only rows whose lemma text is pure ASCII. `na=True` makes a missing
# lemma count as a match so that such rows are dropped, exactly as the former
# `== False` comparison did. NOTE(review): clean() already removes every
# non A-Z/a-z character, so this filter is expected to keep all rows.
df = df.loc[~df['Lemma'].str.contains(r'[^\x00-\x7F]+', regex=True, na=True)]

In [None]:
df

In [None]:
# Work on an explicit, independent copy: later cells add result columns to
# fin_data, and those writes must not depend on (or leak into) df.
fin_data = df[['Sentence', 'Author_name', 'Lemma']].copy()

## Sentiment Analysis using VADER

In [None]:
# Single VADER analyzer instance, read as a global by vadersentimentanalysis().
analyzer = SentimentIntensityAnalyzer()

function to calculate vader sentiment

In [None]:
def vadersentimentanalysis(review):
    """Return the 'compound' entry of VADER's polarity scores for `review`.

    Uses the module-level ``analyzer`` instance.
    """
    return analyzer.polarity_scores(review)['compound']

In [None]:
def vader_analysis(compound, threshold=0.5):
    """Turn a VADER compound score into a sentiment label.

    Args:
        compound: The compound score to classify.
        threshold: Symmetric cut-off. Scores ``>= threshold`` are
            'Positive', scores ``<= -threshold`` are 'Negative', everything
            in between is 'Neutral'. Defaults to 0.5, the value this notebook
            has always used. NOTE(review): a much smaller cut-off (0.05) is
            commonly quoted for VADER - confirm which one is intended.

    Returns:
        'Positive', 'Negative' or 'Neutral'.
    """
    if compound >= threshold:
        return 'Positive'
    if compound <= -threshold:
        return 'Negative'
    return 'Neutral'

In [None]:
# Compound VADER score for each lemmatized sentence.
fin_data['Vader_Sentiment'] = fin_data['Lemma'].apply(vadersentimentanalysis)

In [None]:
# Label each score as Positive / Negative / Neutral.
fin_data['Vader_Analysis'] = fin_data['Vader_Sentiment'].apply(vader_analysis)

In [None]:
fin_data

In [None]:
# Label frequencies; reused by the pie chart at the end of the notebook.
vader_counts = fin_data['Vader_Analysis'].value_counts()
vader_counts

## Sentiment Analysis using TextBlob

function to calculate subjectivity

In [None]:
def getSubjectivity(review):
    """Return the TextBlob subjectivity score of `review`."""
    blob = TextBlob(review)
    return blob.sentiment.subjectivity

function to calculate polarity

In [None]:
def getPolarity(review):
    """Return the TextBlob polarity score of `review`."""
    blob = TextBlob(review)
    return blob.sentiment.polarity

function to analyze the reviews

In [None]:
def analysis(score):
    """Label a numeric score by its sign.

    Returns 'Neutral' for exactly zero, 'Negative' for values below zero and
    'Positive' for everything else.
    """
    if score == 0:
        return 'Neutral'
    return 'Negative' if score < 0 else 'Positive'

In [None]:
# TextBlob polarity per lemmatized sentence, then a sign-based label.
fin_data['TextBlob_Polarity'] = fin_data['Lemma'].apply(getPolarity) 
fin_data['TextBlob_Analysis'] = fin_data['TextBlob_Polarity'].apply(analysis)
fin_data.head()

In [None]:
# Label frequencies; reused by the pie chart at the end of the notebook.
tb_counts = fin_data.TextBlob_Analysis.value_counts()
tb_counts

## Sentiment Analysis using Flair

In [None]:
def flair_senti(text):
    """Classify `text` with the Flair sentiment model and return its label.

    Uses the module-level ``classifier``.

    Args:
        text: The sentence string to classify.

    Returns:
        The predicted label with only its first letter upper-cased (for
        example 'Positive' or 'Negative'), matching the other *_Analysis
        columns. Returns 'Neutral' when the model attached no label to the
        sentence (NOTE(review): expected for empty/blank text - confirm).
    """
    sentence = Sentence(text)
    # predict() annotates `sentence` in place; its return value is not the
    # prediction, so the label has to be read back from the sentence.
    classifier.predict(sentence)
    if not sentence.labels:
        return 'Neutral'
    return sentence.labels[0].value.capitalize()

In [None]:
# Load Flair's 'en-sentiment' model; flair_senti() reads this global.
classifier = TextClassifier.load('en-sentiment')

In [None]:
# Store flair_senti()'s result for each lemmatized sentence.
fin_data['Flair_Analysis'] = fin_data['Lemma'].apply(flair_senti)


## Sentiment Analysis using SentiWordNet

In [None]:
def sentiwordnetanalysis(pos_data):
    """Compute a SentiWordNet polarity score for a POS-tagged sentence.

    For every ``(word, pos)`` pair with a usable WordNet POS, the word is
    lemmatized, its first WordNet synset is looked up, and that synset's
    ``pos_score() - neg_score()`` is added to the total.

    Args:
        pos_data: Iterable of ``(word, pos)`` pairs as produced by
            ``token_stop_pos``.

    Returns:
        The summed score as a number: above zero leans positive, below zero
        leans negative, and 0 means neutral or that no word could be scored.
        A number (not a label) is returned because the result is stored as
        'SWN_Polarity' and labelled afterwards by ``analysis``.
    """
    sentiment = 0
    for word, pos in pos_data:
        if not pos:
            continue

        lemma = wordnet_lemmatizer.lemmatize(word, pos=pos)
        if not lemma:
            continue

        synsets = wordnet.synsets(lemma, pos=pos)
        if not synsets:
            continue

        # Take the first sense, the most common. This scoring previously sat
        # behind the `continue` above and could never run.
        swn_synset = swn.senti_synset(synsets[0].name())
        sentiment += swn_synset.pos_score() - swn_synset.neg_score()

    # Returned only after the whole sentence has been processed.
    return sentiment

In [None]:
# Reads POS_tagged from df (fin_data does not carry that column); the result
# is aligned to fin_data by index.
fin_data['SWN_Polarity'] = df['POS_tagged'].apply(sentiwordnetanalysis)

In [None]:
# Label each SWN_Polarity value with the same sign-based rule used for TextBlob.
fin_data['SWN_Analysis'] = fin_data['SWN_Polarity'].apply(analysis)

In [None]:
fin_data.head(4)

In [None]:
# Label frequencies for the SentiWordNet approach.
swn_counts= fin_data['SWN_Analysis'].value_counts()
swn_counts

In [None]:
# Only the VADER and TextBlob columns are exported; the Flair and
# SentiWordNet columns are left out of the result table.
result_df = pd.DataFrame(fin_data[['Sentence', 'Author_name', 'Vader_Sentiment','Vader_Analysis','TextBlob_Polarity',
                                  'TextBlob_Analysis']])

In [None]:
# Spot-check one sentence. NOTE(review): [33] looks the row up by index label,
# not by position - it fails if that row was filtered out earlier; confirm.
result_df.Sentence[33]

In [None]:
# Write the final table to disk.
result_df.to_csv('result.csv')

## Visual representation of TextBlob, VADER, SentiWordNet results

In [None]:
import matplotlib.pyplot as plt
%matplotlib inline
plt.figure(figsize=(15,7))
plt.subplot(1,3,1)
plt.title("TextBlob results")
plt.pie(tb_counts.values, labels = tb_counts.index, explode = (0, 0, 0.25), autopct='%1.1f%%', shadow=False)
plt.subplot(1,3,2)
plt.title("VADER results")
plt.pie(vader_counts.values, labels = vader_counts.index, explode = (0, 0, 0.25), autopct='%1.1f%%', shadow=False)
# plt.subplot(1,3,3)
# plt.title("SentiWordNet results")
# plt.pie(swn_counts.values, labels = swn_counts.index, explode = (0, 0, 0.25), autopct='%1.1f%%', shadow=False)