In [1]:
import glob
import os
import pandas as pd

In [2]:
# Tweets live in <parent of cwd>/A4/tweets; every per-source CSV there is an input.
tweets_dir = os.path.join(os.path.dirname(os.getcwd()), "A4", "tweets")
inputfile = os.path.join(tweets_dir, "*.csv")
outputfile = os.path.join(tweets_dir, "all.csv")
# all.csv is written into the same directory that is globbed, so after the first
# run it would be picked up as an input and merged into itself. Exclude it, and
# sort so the processing order does not depend on the filesystem.
csv_list = sorted(
    path
    for path in glob.glob(inputfile)
    if os.path.normpath(path) != os.path.normpath(outputfile)
)

In [3]:
# Start all.csv from the first input file: this write creates (or overwrites)
# the output and emits the "text" header row.
if not csv_list:
    raise FileNotFoundError(f"No tweet CSV files matched {inputfile!r}")
filepath = csv_list[0]
# Only the "text" column is kept, so there is no need to parse the other columns.
df = pd.read_csv(filepath, encoding="gbk", low_memory=False, usecols=["text"])
# to_csv returns None when given a path, so its result is deliberately not assigned.
df.to_csv(outputfile, encoding="gbk", index=False)

In [4]:
# Combine csv files: append the "text" column of every remaining input to all.csv.
# header=False because the header row is written once, with the first file.
for filepath in csv_list[1:]:
    # all.csv sits in the directory being globbed; on a re-run it must not be
    # appended to itself.
    if os.path.abspath(filepath) == os.path.abspath(outputfile):
        continue
    df = pd.read_csv(filepath, encoding="gbk", low_memory=False, usecols=["text"])
    # to_csv returns None when given a path, so its result is not assigned.
    df.to_csv(outputfile, encoding="gbk", index=False, header=False, mode='a+')


In [5]:
# Change Dtype to String, Drop duplicated rows
import pandas as pd
import re

# Load the combined tweets as one chained expression, so re-running the cell
# always rebuilds `data` from the file instead of mutating an existing frame.
data = (
    pd.read_csv("tweets/all.csv")
    # Rows without text have nothing to analyse and would break .lower() later.
    .dropna(subset=["text"])
    .astype({"text": "string"})
    .drop_duplicates()
)

In [6]:
# Fetch the NLTK "stopwords" corpus, then build the set of English stop words
# that is used to filter tokens during cleaning.
from nltk import download
from nltk.corpus import stopwords

download('stopwords')
english_stopword_list = stopwords.words('english')
stop_words = set(english_stopword_list)

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/chenanqi/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [7]:
# Clean Data
def text_cleaner(text):
    """Normalise one tweet and return its non-stop-word tokens.

    Steps: lower-case, drop parenthesised fragments, drop double quotes and
    possessive 's, replace every non-letter with a space, collapse runs of
    'm' (e.g. "hmmmm") to "mm", then split on whitespace and remove words
    found in the module-level ``stop_words`` set.

    Parameters
    ----------
    text : str
        Raw tweet text. Missing values (None, NaN, pd.NA) or any other
        non-string are treated as empty.

    Returns
    -------
    list of str
        Cleaned tokens in their original order; ``[]`` for missing input.
    """
    # Missing cells arrive as None / float NaN / pd.NA, none of which has .lower().
    if not isinstance(text, str):
        return []
    new_string = text.lower()
    new_string = re.sub(r'\([^)]*\)', '', new_string)   # remove "(...)" fragments
    new_string = re.sub('"', '', new_string)            # remove double quotes
    new_string = re.sub(r"'s\b", "", new_string)        # remove possessive 's
    new_string = re.sub("[^a-zA-Z]", " ", new_string)   # non-letters -> space
    new_string = re.sub('[m]{2,}', 'mm', new_string)    # "hmmmm" -> "hmm"
    return [w for w in new_string.split() if w not in stop_words]

In [8]:
# Tokenise every tweet; `words` holds one token list per row of `data`,
# ready to be stored as a new column.
words = [text_cleaner(tweet) for tweet in data['text']]

In [9]:
# Store the token lists next to the raw text, then preview the first rows.
data['cleaned_text'] = words
data['cleaned_text'].head(5)

0    [rt, frank, chimienti, handfuls, biden, rally,...
1    [usually, wicker, furniture, used, outdoors, p...
2    [underthewitness, hi, city, law, amp, law, req...
3    [dawnroseturner, bonjour, unroll, smillssk, sk...
Name: cleaned_text, dtype: object

In [10]:
# Print a summary of the frame: index range, per-column non-null counts and dtypes.
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2185 entries, 0 to 4787
Data columns (total 2 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   text          2185 non-null   string
 1   cleaned_text  2185 non-null   object
dtypes: object(1), string(1)
memory usage: 51.2+ KB


In [11]:
# Build a bag of words (token -> occurrence count) for every cleaned tweet.
# The counts are kept in a list instead of being thrown away each iteration,
# and the variable is no longer called `dict`, which would hide the builtin
# for every later cell.
bags_of_words = []
for tokens in data['cleaned_text']:
    word_counts = {}
    for w in tokens:
        word_counts[w] = word_counts.get(w, 0) + 1
    bags_of_words.append(word_counts)

In [12]:
# Read the positive and negative word lists, one entry per line.
# The with-blocks close each file as soon as it has been read; previously
# both handles stayed open for the life of the kernel.
with open("polarity/positive-words.txt", "r") as pWords:
    pWordsRead = pWords.read()
pWList = pWordsRead.split("\n")
with open("polarity/negative-words.txt", "r") as nWords:
    nWordsRead = nWords.read()
nWList = nWordsRead.split("\n")

In [13]:
def polarity_check(row, positive_words=None, negative_words=None):
    """Classify one tokenised tweet as positive, negative or neutral.

    Each distinct token is looked up in the positive lexicon first and, only
    if absent there, in the negative lexicon. Every occurrence of a matching
    token counts towards that side's total.

    Parameters
    ----------
    row : iterable of str
        Tokens of one tweet.
    positive_words, negative_words : container of str, optional
        Lexicons to match against. Default to the module-level ``pWList``
        and ``nWList`` so existing one-argument calls keep working.

    Returns
    -------
    (list of str, str)
        The distinct matched tokens in order of first appearance, and the
        label "positive", "negative" or "neutral" (equal counts, including
        no matches at all).
    """
    if positive_words is None:
        positive_words = pWList
    if negative_words is None:
        negative_words = nWList

    # Occurrence count per token; dicts keep first-appearance order.
    word_counts = {}
    for w in row:
        word_counts[w] = word_counts.get(w, 0) + 1

    positive = 0
    negative = 0
    match = []
    for word, count in word_counts.items():
        if word in positive_words:
            positive += count
            match.append(word)
        elif word in negative_words:
            negative += count
            match.append(word)

    if negative < positive:
        polarity = "positive"
    elif negative == positive:
        polarity = "neutral"
    else:
        polarity = "negative"

    return match, polarity



In [14]:
import csv
# Write one result row per tweet. The with-block closes (and therefore flushes)
# the file when the loop finishes; previously the handle was overwritten by the
# writer object and never closed, so trailing rows could stay in the buffer.
with open("sentiment_analysis_results.csv", "w+", newline='', encoding="utf-8") as result_file:
    result_csv = csv.writer(result_file, delimiter=',', quotechar='"', quoting=csv.QUOTE_MINIMAL)
    result_csv.writerow(['Tweet', 'Text', 'Match', 'Polarity'])
    # 'Tweet' is a 1-based running number; 'Text' is the token list as written by csv (its str form).
    for cnt, row_text in enumerate(data['cleaned_text'], start=1):
        match, polarity = polarity_check(row_text)
        result_csv.writerow([cnt, row_text, ','.join(match), polarity])
    
