This notebook gives the count comparison of submissions and/or comments for INDIVIDUAL users (in r/stopsmoking and r/stopdrinking) BEFORE and DURING COVID-19 pandemic. 

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.dates as md
import seaborn as sns
import datetime as dt
import time
import datetime
from textblob import TextBlob
import nltk
from nltk import ngrams
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
# nltk.download('punkt')
import operator
import re
import glob

In [52]:
def getStatsDict(all_files):
    stats_dict = dict() # this will keep record of username as the key and total of number of comments, number comments
    #before Covid and number of comments during Covid

    # Will do a for loop that goes through each file once and get the stats on number of comments/posts
    for filename in all_files:
        username = filename.split('/')[-1].split('.')[0]
        extension = filename.split('/')[-1].split('.')[1]
        if extension == 'tsv':
            df_temp = pd.read_csv(filename, sep='\t')
        else:
            df_temp = pd.read_csv(filename, sep=',') # for csv files
        total_coms = df_temp.shape[0]
        # spliting based on Covid official pandemic date
        try:
            b_df_temp = df_temp.loc[df_temp['created_utc'] < datetime.datetime(2020, 3, 11, 0, 0).timestamp()]
            a_df_temp = df_temp.loc[df_temp['created_utc'] > datetime.datetime(2020, 3, 10, 0, 0).timestamp()]
        except TypeError:    
            utc_lst = df_temp['created_utc'].tolist()
            ind_lst = [] # list of indices in the data frame that is causing problems
            for i in range(df_temp.shape[0]):
                try:
                    temp = float(utc_lst[0])
                except ValueError:
                    ind_lst.append(i)
            df_temp.drop(df_temp.index[ind_lst], inplace=True)
            b_df_temp = df_temp.loc[df_temp['created_utc'] < datetime.datetime(2020, 3, 11, 0, 0).timestamp()]
            a_df_temp = df_temp.loc[df_temp['created_utc'] > datetime.datetime(2020, 3, 10, 0, 0).timestamp()]
        b_coms = b_df_temp.shape[0]
        a_coms = a_df_temp.shape[0]
        stats_dict[username] = [total_coms, b_coms, a_coms]
    return stats_dict

In [13]:
# This notebook will contain the code for the analysis of reddit posts by individual reddit users.

# Working on Smoking data set (comments)

path_smokeC = "/Users/nayza/Downloads/comments_threads/smokingComments_threads/"
smokeC_files = glob.glob(path_smokeC + "*.tsv") # glob allows one to import all files with the same file extension at once
statsDict_smokeC = getStatsDict(smokeC_files)

In [15]:
# continuation of the previous cell; for Smoking (comments)
df_smokeC = pd.DataFrame.from_dict(statsDict_smokeC, orient='index', columns=['Total comments', 'Before Covid',
                                                                      'During Covid'])

In [16]:
dfSmokeC = df_smokeC.loc[(df_smokeC['Before Covid'] > 0) & (df_smokeC['During Covid'] > 0)]


In [17]:
dfSmokeC.shape[0]

5129

In [45]:
# Now, will do the same thing for Smoking (sumbmissions)

path_smokeS = "/Users/nayza/Downloads/Submissions_threads/smoking_threads/"
smokeS_files = glob.glob(path_smokeS + "*.tsv")
statsDict_smokeS = getStatsDict(smokeS_files)

df_smokeS = pd.DataFrame.from_dict(statsDict_smokeS, orient='index', columns=['Total submissions', 'Before Covid',
                                                                      'During Covid'])

dfSmokeS = df_smokeS.loc[(df_smokeS['Before Covid'] > 0) & (df_smokeS['During Covid'] > 0)]

In [46]:
dfSmokeS.shape[0]

821

In [31]:
# Lastly, Drinking data set.
# Note that both submissions and comments files of the drinking data set are stored in one folder (one path). Will 
# separate them out in the next cell.

path_drinkCS = "/Users/nayza/Downloads/stopdrinking_authors/"
drinkCS_files = glob.glob(path_drinkCS + "*.csv")

In [41]:
# Separating out drinkCS_files into comments and submissions.

#drinkCS_files[0].split('/')[-1]
drinkC_files, drinkS_files = [], []
notCorS_files = [] # if the filename has neither 'comments' or 'submissions' in it, filepath will be appended to here.
for filepath in drinkCS_files:
    filename = filepath.split('/')[-1]
    if re.search('comments', filename):
        drinkC_files.append(filepath)
    elif re.search('submissions', filename):
        drinkS_files.append(filepath)
    else:
        notCorS_files.append(filepath)
# notCorS_files is empty!

In [53]:
# Now, we repeat what we did with smokeC and smokeS here with drinkC and drinkS.

# first drinkC
statsDict_drinkC = getStatsDict(drinkC_files)
df_drinkC = pd.DataFrame.from_dict(statsDict_drinkC, orient='index', columns=['Total comments', 'Before Covid',
                                                                      'During Covid'])
dfDrinkC = df_drinkC.loc[(df_drinkC['Before Covid'] > 0) & (df_drinkC['During Covid'] > 0)]

In [56]:
# Lastly, drinkS.
statsDict_drinkS = getStatsDict(drinkS_files)
df_drinkS = pd.DataFrame.from_dict(statsDict_drinkS, orient='index', columns=['Total submissions', 'Before Covid',
                                                                      'During Covid'])
dfDrinkS = df_drinkS.loc[(df_drinkS['Before Covid'] > 0) & (df_drinkS['During Covid'] > 0)]

In [58]:
# Now, exporting the results as CSV files

dfSmokeC.to_csv("/Users/nayza/Desktop/YTproject/smokeComments_results.csv")

In [59]:
dfSmokeS.to_csv("/Users/nayza/Desktop/YTproject/smokeSubmissions_results.csv")

In [60]:
dfDrinkC.to_csv("/Users/nayza/Desktop/YTproject/drinkComments_results.csv")
dfDrinkS.to_csv("/Users/nayza/Desktop/YTproject/drinkSubmissions_results.csv")