# Settings

In [1]:
import os
import unicodedata
from io import open
import csv

In [2]:
import nltk
from nltk.sentiment import vader
from nltk.sentiment.vader import SentimentIntensityAnalyzer

In [7]:
# nltk.download('vader_lexicon', quiet=False)

In [8]:
vader_model = SentimentIntensityAnalyzer()

# Import data

In [9]:
file_name = 'twitter_en_big.txt'
path = os.path.join("chat_corpus/", file_name)

# Perform sentiment analysis on responses

In [10]:
tweets=[]
lines = {}
with open(path, 'r', encoding='iso-8859-1') as f:
    print(f)
    for i, line in enumerate(f):
        if i % 2 == 0:
            lines['question'] = line
            lines['senti_question'] = vader_model.polarity_scores(line)
        else:
            lines['respond'] = line
            lines['senti_respond'] = scores = vader_model.polarity_scores(line)
        
        if i % 2 == 1:
            tweets.append(lines)
            lines = {}

<_io.TextIOWrapper name='chat_corpus/twitter_en_big.txt' mode='r' encoding='iso-8859-1'>


In [11]:
print(len(tweets))

2601244


In [38]:
example

[{'question': 'whitney- i dont think shane wants to work with us. alex- i think shane has made a deal with everyone in the house. whitney- i do too\n',
  'respond': "the paranoia has already began...why do they think shane doesn't wanna work with them? because he's not up their butt?\n",
  'senti_respond': {'compound': -0.3313,
   'neg': 0.11,
   'neu': 0.89,
   'pos': 0.0}},
 {'question': 'if you see any clowns tn lmk. me and the boys cleaning the streetsð\x9f\x98¤ð\x9f\x98¤\n',
  'respond': 'white ford boys ð\x9f\x98¤ð\x9f\x98¤ð\x9f\x98\x82\n',
  'senti_respond': {'compound': 0.0, 'neg': 0.0, 'neu': 1.0, 'pos': 0.0}},
 {'question': 'when you find so you decide to stay in her class ð\x9f\x98\x82ð\x9f\x98\x89\n',
  'respond': "too bad you don't actually go to my school you liar ð\x9f\x98\x82\n",
  'senti_respond': {'compound': -0.7783,
   'neg': 0.405,
   'neu': 0.595,
   'pos': 0.0}},
 {'question': 'psych is being taken off of netflix tomorrow and i have 8 episodes left and i work ton

# Filter tweets on compound score

In [22]:
def get_tweets(data, threshold):
    result_pos = []
    result_neg = []
    for tweet in data:
        if tweet['senti_respond']['compound'] < -1 * threshold:
            result_neg.append(tweet)
        elif  tweet['senti_respond']['compound'] > threshold:
            result_pos.append(tweet)
    
    return([result_pos, result_neg])

In [23]:
positive, negative = get_tweets(tweets, 0.5)

In [24]:
print(len(negative))
print(len(positive))

289147
566118


In [15]:
negative[0]

{'question': 'when you find so you decide to stay in her class ð\x9f\x98\x82ð\x9f\x98\x89\n',
 'respond': "too bad you don't actually go to my school you liar ð\x9f\x98\x82\n",
 'senti_question': {'compound': 0.0, 'neg': 0.0, 'neu': 1.0, 'pos': 0.0},
 'senti_respond': {'compound': -0.7783,
  'neg': 0.405,
  'neu': 0.595,
  'pos': 0.0}}

In [16]:
negative[2000]

{'question': 'not sure what u mean. and does this mean u dont agree to everyone having a respectful discussion?\n',
 'respond': "not disagreeing w/ya. just saying i'm the witch and no one else is 2 blame.\n",
 'senti_question': {'compound': -0.0186,
  'neg': 0.214,
  'neu': 0.629,
  'pos': 0.157},
 'senti_respond': {'compound': -0.6204,
  'neg': 0.371,
  'neu': 0.523,
  'pos': 0.106}}

# Save dataset

In [150]:
# Extracts pairs of sentences from conversations
def extractSentencePairs(conversations):
    qa_pairs = []
    for conversation in conversations:
        # Iterate over all the lines of the conversation
        inputLine = conversation['question'].strip()
        targetLine = conversation['respond'].strip()
        # Filter wrong samples (if one of the lists is empty)
        if inputLine and targetLine:
            qa_pairs.append([inputLine, targetLine])
    return qa_pairs

In [156]:
datafile = "chat_corpus/chatbot_tweets_neg.txt"

In [157]:
# Write new csv file
print("\nWriting newly formatted file...")
with open(datafile, 'w', encoding='utf-8') as outputfile:
    writer = csv.writer(outputfile, delimiter="=", lineterminator='\n')
    for pair in extractSentencePairs(negative):
        writer.writerow(pair)


Writing newly formatted file...


# Investigate question sentiment

In [25]:
negative[0]

{'question': 'when you find so you decide to stay in her class ð\x9f\x98\x82ð\x9f\x98\x89\n',
 'respond': "too bad you don't actually go to my school you liar ð\x9f\x98\x82\n",
 'senti_question': {'compound': 0.0, 'neg': 0.0, 'neu': 1.0, 'pos': 0.0},
 'senti_respond': {'compound': -0.7783,
  'neg': 0.405,
  'neu': 0.595,
  'pos': 0.0}}

In [29]:
amount_positive = 0
amount_neutral = 0
amount_negative = 0

for tweet in negative:
    #print(tweet['senti_question']['compound'])
    compound_score = tweet['senti_question']['compound']
    if compound_score < -0.3:
        amount_negative += 1
    elif compound_score > 0.3:
        amount_positive += 1
    else:
        amount_neutral += 1

In [33]:
print(amount_negative)
print(amount_positive)
print(amount_neutral)

91494
74860
122793


In [34]:
amount_positive = 0
amount_neutral = 0
amount_negative = 0

for tweet in positive:
    #print(tweet['senti_question']['compound'])
    compound_score = tweet['senti_question']['compound']
    if compound_score < -0.3:
        amount_negative += 1
    elif compound_score > 0.3:
        amount_positive += 1
    else:
        amount_neutral += 1

In [35]:
print(amount_negative)
print(amount_positive)
print(amount_neutral)

83086
255310
227722
