In [None]:
import nltk
import matplotlib.pyplot as plt
#nltk.download('vader_lexicon')
from nltk.sentiment.vader import SentimentIntensityAnalyzer
import bs4 as bs
import numpy as np
import pandas as pd
import requests
from pandas import DataFrame
import csv
from datetime import datetime
import time
import pathlib

# VADER analyzer; its polarity_scores() is called once per scraped reply.
sid = SentimentIntensityAnalyzer()

# User input.
# NOTE: the scraper builds each page URL as url_link + 'page-<n>', so the
# thread URL is expected to end with a '/'.
url_link = input("Enter URL: ").rstrip()
file_name = input("Name the file: ").rstrip()

# Create the csv with a header row if the csv does not already exist.
# When it does exist, the scraped rows are appended below the existing content.
csv_file_path = pathlib.Path(file_name + '.csv')
if csv_file_path.exists():
    input("This csv already exists. Press Enter to append existing csv.")
else:
    # Every appended row has six fields (user name, date, time, compound
    # score, sentiment label, reply text), so the header needs six names.
    # The original five-name header left 'Sentiment' out, which shifted
    # 'Reply' over the sentiment label and left the reply text unlabelled.
    csv_columns = ['Name', 'Date', 'Time', 'Score', 'Sentiment', 'Reply']
    csv_header_df = pd.DataFrame(columns=csv_columns)
    csv_header_df.to_csv(csv_file_path, index=False, mode="a", header=True, encoding='utf-8-sig')


# Start from an empty frame for the scraped rows.
df2 = pd.DataFrame()

forum_thread_page_num = 1
# One DataFrame per scraped page; they are concatenated once after the last
# page (DataFrame.append was removed in pandas 2.0).
page_frames = []

while True:
    # Wait one second between page requests.
    time.sleep(1)
    url = url_link + 'page-' + str(forum_thread_page_num)
    print(forum_thread_page_num)
    # The timeout keeps a stalled connection from hanging the scrape forever.
    request = requests.get(url, timeout=30)
    response = request.text
    soup = bs.BeautifulSoup(response, 'lxml')

    # Create list of usernames so that they can later be matched to comments.
    username = soup.findAll("a", {"itemprop": "name"})
    username_list = [name.get_text() for name in username]

    # Find the date & time of every post.
    # They live in the <time> tag's "datetime" attribute, formatted like
    # '2020-09-24T18:28:56-0400'. Slicing [0:10] gives the date '2020-09-24'
    # and [11:19] gives the time '18:28:56' (the UTC offset is dropped).
    date_soup = soup.findAll('div', {'class': 'message-attribution-main'})
    date_list = []
    time_list = []
    for item in date_soup:
        stamp = item.find('time').attrs['datetime']
        date_list.append(stamp[0:10])
        time_list.append(stamp[11:19])

    # Find the comments.
    comments = soup.findAll("div", {"class": "bbWrapper"})

    # Keep only the text that is a direct child of each bbWrapper div;
    # 'recursive=False' prevents descending into sub-tags.
    comment_list = []
    for comment in comments:
        pieces = comment.find_all(text=True, recursive=False)
        # Join the pieces and drop newlines so multiple paragraphs become one line.
        text = ''.join(pieces).replace('\n', '')
        # An empty comment is recorded as 'N/A'.
        comment_list.append(text or 'N/A')

    # Run the sentiment analyzer on each comment and keep its 'compound' score.
    compound_result_list = [
        sid.polarity_scores(comment).get('compound') for comment in comment_list
    ]

    # Bin each compound score:
    #   score <= -0.05          -> negative
    #   -0.05 < score < 0.05    -> neutral
    #   score >= 0.05           -> positive
    # The label spellings are left exactly as they were because they are
    # written into the csv alongside rows from earlier runs.
    sentiment_list = []
    for score in compound_result_list:
        if score <= -0.05:
            sentiment = "Negative"
        elif score < 0.05:
            sentiment = "Neutural"
        else:
            # A final 'else' guarantees every score gets its own label instead
            # of silently reusing the previous iteration's value.
            sentiment = "Positive"
        sentiment_list.append(sentiment)

    # Combine the six lists row-wise into this page's DataFrame.
    # zip() stops at the shortest list, so surplus entries in longer lists are dropped.
    new_dict = zip(username_list, date_list, time_list, compound_result_list, sentiment_list, comment_list)
    df = DataFrame(list(new_dict))
    page_frames.append(df)

    # A page with 50 or more usernames is treated as "not the last page".
    # NOTE(review): a thread whose final page holds exactly 50 posts relies on
    # the following page yielding fewer than 50 usernames - confirm how the
    # forum answers out-of-range page numbers.
    if len(username_list) >= 50:
        forum_thread_page_num = forum_thread_page_num + 1
        continue

    # ----- last page reached: assemble, save and report -----
    df2 = pd.concat(page_frames, ignore_index=True)

    if df2.empty:
        # Nothing was scraped; the column lookups and charts below would fail.
        print('No posts were found at ' + url)
        break

    # Rename the positional columns of the dataframe.
    df2 = df2.rename(columns={0: "Username", 1: "Date", 2: "Time", 3: "Score", 4: "Sentiment", 5: "Replies"})
    # Convert date & time columns from string to datetime objects so that they can be manipulated with pandas.
    df2["Date"] = pd.to_datetime(df2["Date"], format="%Y-%m-%d")
    df2["Time"] = pd.to_datetime(df2["Time"], format="%H:%M:%S").dt.time

    print('*FINISHED*')

    # Write the DataFrame to the csv in append mode with the header removed.
    # This is done before any plotting so that a charting failure cannot
    # discard the scraped data.
    df2.to_csv(csv_file_path, index=False, mode="a", header=False, encoding='utf-8-sig')

    ##### Total replies by a user #####
    # Group by Username; .size() gives the number of rows per user and
    # .reset_index() turns that series back into a dataframe.
    df3 = df2.groupby(["Username"]).size().reset_index(name='Total Replies')
    # Sort ascending and keep the tail, i.e. the ten users with the most
    # replies. Ascending order is used because barh draws the first row at the
    # bottom, so the biggest bar ends up on top.
    df3 = df3.sort_values(by=['Total Replies'], ascending=True).tail(10)
    username_labels = df3["Username"]
    total_replies_label = df3["Total Replies"]
    plt.figure(figsize=(15, 10))
    plt.barh(username_labels, total_replies_label)
    plt.show()

    ##### Total replies per date, in chronological order #####
    # Index by Date and group into daily bins with Grouper; .size() counts the
    # rows in each bin and .reset_index() converts the series to a dataframe.
    df4 = df2.set_index("Date").groupby(pd.Grouper(freq='D')).size().reset_index(name='Total Replies')
    date_labels = df4["Date"]
    total_replies_label = df4["Total Replies"]
    plt.figure(figsize=(15, 10))
    plt.bar(date_labels, total_replies_label)
    plt.xticks(rotation=45)
    plt.show()

    ##### Sentiment: negative, neutral, positive #####
    # Count the rows for each sentiment label.
    df5 = df2.groupby(["Sentiment"]).size().reset_index(name='Total Sentiment')
    total_sentiment = df5["Total Sentiment"]
    # Take the labels from the grouped data rather than a fixed three-item
    # list: when a class is absent, a fixed list no longer matches the number
    # of wedges and plt.pie raises.
    sentiment_labels = df5["Sentiment"].tolist()
    # Visualize sentiment in a pie chart.
    plt.pie(total_sentiment, labels=sentiment_labels, shadow=True, startangle=90, autopct='%1.1f%%', radius=3)
    plt.show()
    display(df5)

    break

#positive sentiment: compound score >= 0.05
#neutral sentiment: (compound score > -0.05) and (compound score < 0.05)
#negative sentiment: compound score <= -0.05