In [17]:
from textblob import TextBlob
from nltk.sentiment import SentimentIntensityAnalyzer
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
import nltk
import pandas as pd
import re
import pandas as pd
from textblob import TextBlob
import random
from statsmodels.stats.multicomp import pairwise_tukeyhsd
import statsmodels.api as sm # anova
from statsmodels.formula.api import ols # regression 

In [18]:
def clean_text(text):
    text = text.lower()
    text = re.sub(r'<[^>]+>|https?:\/\/\S+', '', text)
    text = re.sub(r'[^a-z\s]', '', text)
    tokens = word_tokenize(text)
    stop_words = set(stopwords.words('english'))
    tokens = [word for word in tokens if word not in stop_words]

    stemmer = PorterStemmer()
    tokens = [stemmer.stem(word) for word in tokens]

    return ' '.join(tokens)

In [19]:
f18 = pd.read_csv('fall18-c.csv')
f19 = pd.read_csv('fall19-c.csv')
f20 = pd.read_csv('fall20-c.csv')
f21 = pd.read_csv('fall21-c.csv')
f22 = pd.read_csv('fall22-c.csv')
f23 = pd.read_csv('fall23-c.csv')
dfs = [f18, f19, f20, f21, f22, f23]
years = ['2018', '2019', '2020', '2021', '2022', '2023']

for df in dfs:
    df['content'] = df['content'].astype(str).apply(clean_text)

### Year Checking

In [20]:
from statsmodels.stats.multicomp import pairwise_tukeyhsd
nltk.download('vader_lexicon')
sia = SentimentIntensityAnalyzer()
def get_sentiment(text):
    tb_score = TextBlob(text).sentiment.polarity
    sia_score = sia.polarity_scores(text)['compound']
    return (tb_score + sia_score) / 2

def randomize_anova_tukey(num_iterations):
    for iteration in range(num_iterations):
        # Combine and shuffle dataframes
        combined_df = pd.concat(dfs, ignore_index=True)
        shuffled_combined_df = combined_df.sample(frac=1, random_state=random.randint(1, 10000)).reset_index(drop=True)

        # Split the shuffled dataframe back into individual dataframes
        start_idx = 0
        shuffled_dfs = []
        for df in dfs:
            end_idx = start_idx + len(df)
            shuffled_df = shuffled_combined_df.iloc[start_idx:end_idx]
            shuffled_df['sentiment_score'] = shuffled_df['content'].apply(get_sentiment)
            shuffled_dfs.append(shuffled_df)
            start_idx = end_idx

        # Combine shuffled dataframes for ANOVA and Tukey HSD test
        combined_shuffled_df = pd.concat(shuffled_dfs)
        combined_shuffled_df['year'] = combined_shuffled_df.index // (len(combined_shuffled_df) // len(years))

        # Perform ANOVA
        model = ols('sentiment_score ~ C(year)', data=combined_shuffled_df).fit()
        anova_table = sm.stats.anova_lm(model, typ=2)
        print(f"ANOVA Iteration {iteration+1}:")
        print(anova_table)

        # Perform Tukey HSD test
        tukey_results = pairwise_tukeyhsd(endog=combined_shuffled_df['sentiment_score'], 
                                          groups=combined_shuffled_df['year'], alpha=0.05)
        print(f"Tukey HSD Test Iteration {iteration+1}:")
        print(tukey_results)

# Run the function for 5 iterations
randomize_anova_tukey(5)

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /Users/naazsibia/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  shuffled_df['sentiment_score'] = shuffled_df['content'].apply(get_sentiment)


ANOVA Iteration 1:
               sum_sq       df         F    PR(>F)
C(year)      0.608222      6.0  1.699318  0.116668
Residual  2768.222555  46405.0       NaN       NaN
Tukey HSD Test Iteration 1:
Multiple Comparison of Means - Tukey HSD, FWER=0.05
group1 group2 meandiff p-adj   lower  upper  reject
---------------------------------------------------
     0      1  -0.0037 0.9648 -0.0153 0.0079  False
     0      2  -0.0024 0.9962  -0.014 0.0091  False
     0      3   0.0022  0.998 -0.0094 0.0137  False
     0      4   0.0055 0.7975  -0.006 0.0171  False
     0      5  -0.0024 0.9967  -0.014 0.0092  False
     0      6    0.264 0.7277 -0.2453 0.7733  False
     1      2   0.0013 0.9999 -0.0103 0.0129  False
     1      3   0.0059 0.7462 -0.0057 0.0175  False
     1      4   0.0093 0.2179 -0.0023 0.0208  False
     1      5   0.0013 0.9999 -0.0102 0.0129  False
     1      6   0.2677 0.7143 -0.2415  0.777  False
     2      3   0.0046 0.9047  -0.007 0.0162  False
     2      4    0.0

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  shuffled_df['sentiment_score'] = shuffled_df['content'].apply(get_sentiment)


ANOVA Iteration 2:
               sum_sq       df         F    PR(>F)
C(year)      0.395982      6.0  1.106253  0.355692
Residual  2768.434795  46405.0       NaN       NaN
Tukey HSD Test Iteration 2:
Multiple Comparison of Means - Tukey HSD, FWER=0.05
group1 group2 meandiff p-adj   lower  upper  reject
---------------------------------------------------
     0      1   -0.004 0.9481 -0.0156 0.0075  False
     0      2   0.0044 0.9215 -0.0072  0.016  False
     0      3   0.0011    1.0 -0.0105 0.0126  False
     0      4   0.0017 0.9995 -0.0099 0.0133  False
     0      5  -0.0034 0.9785  -0.015 0.0082  False
     0      6   0.0229    1.0 -0.4864 0.5322  False
     1      2   0.0084 0.3243 -0.0031   0.02  False
     1      3   0.0051 0.8531 -0.0065 0.0167  False
     1      4   0.0057 0.7687 -0.0058 0.0173  False
     1      5   0.0007    1.0 -0.0109 0.0122  False
     1      6   0.0269    1.0 -0.4824 0.5362  False
     2      3  -0.0033 0.9795 -0.0149 0.0082  False
     2      4  -0.00

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  shuffled_df['sentiment_score'] = shuffled_df['content'].apply(get_sentiment)


ANOVA Iteration 3:
               sum_sq       df         F    PR(>F)
C(year)      0.371633      6.0  1.038222  0.398013
Residual  2768.459144  46405.0       NaN       NaN
Tukey HSD Test Iteration 3:
Multiple Comparison of Means - Tukey HSD, FWER=0.05
group1 group2 meandiff p-adj   lower  upper  reject
---------------------------------------------------
     0      1   0.0049 0.8775 -0.0067 0.0165  False
     0      2   0.0072 0.5218 -0.0044 0.0188  False
     0      3   0.0004    1.0 -0.0111  0.012  False
     0      4   0.0025 0.9955 -0.0091 0.0141  False
     0      5   0.0045 0.9134 -0.0071 0.0161  False
     0      6   0.1901 0.9281 -0.3192 0.6994  False
     1      2   0.0023  0.997 -0.0092 0.0139  False
     1      3  -0.0044 0.9186  -0.016 0.0071  False
     1      4  -0.0024 0.9968 -0.0139 0.0092  False
     1      5  -0.0004    1.0  -0.012 0.0112  False
     1      6   0.1852 0.9362 -0.3241 0.6945  False
     2      3  -0.0068 0.5976 -0.0184 0.0048  False
     2      4  -0.00

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  shuffled_df['sentiment_score'] = shuffled_df['content'].apply(get_sentiment)


ANOVA Iteration 4:
               sum_sq       df         F    PR(>F)
C(year)      0.524417      6.0  1.465131  0.185718
Residual  2768.306360  46405.0       NaN       NaN
Tukey HSD Test Iteration 4:
Multiple Comparison of Means - Tukey HSD, FWER=0.05
group1 group2 meandiff p-adj   lower  upper  reject
---------------------------------------------------
     0      1  -0.0046 0.9071 -0.0162  0.007  False
     0      2   0.0027 0.9939 -0.0089 0.0142  False
     0      3   0.0014 0.9998 -0.0102  0.013  False
     0      4  -0.0061 0.7114 -0.0177 0.0055  False
     0      5   -0.003 0.9884 -0.0146 0.0086  False
     0      6  -0.1657 0.9625  -0.675 0.3436  False
     1      2   0.0072 0.5193 -0.0043 0.0188  False
     1      3    0.006 0.7309 -0.0056 0.0176  False
     1      4  -0.0015 0.9997 -0.0131 0.0101  False
     1      5   0.0016 0.9997   -0.01 0.0132  False
     1      6  -0.1611 0.9673 -0.6704 0.3482  False
     2      3  -0.0013 0.9999 -0.0128 0.0103  False
     2      4  -0.00

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  shuffled_df['sentiment_score'] = shuffled_df['content'].apply(get_sentiment)


ANOVA Iteration 5:
               sum_sq       df         F    PR(>F)
C(year)      0.560429      6.0  1.565763  0.152601
Residual  2768.270348  46405.0       NaN       NaN
Tukey HSD Test Iteration 5:
Multiple Comparison of Means - Tukey HSD, FWER=0.05
group1 group2 meandiff p-adj   lower  upper  reject
---------------------------------------------------
     0      1  -0.0091 0.2354 -0.0207 0.0025  False
     0      2  -0.0008    1.0 -0.0124 0.0108  False
     0      3  -0.0017 0.9996 -0.0132 0.0099  False
     0      4   0.0009    1.0 -0.0106 0.0125  False
     0      5  -0.0036 0.9692 -0.0152  0.008  False
     0      6  -0.1539  0.974 -0.6632 0.3554  False
     1      2   0.0083 0.3406 -0.0033 0.0199  False
     1      3   0.0074 0.4847 -0.0041  0.019  False
     1      4     0.01 0.1392 -0.0015 0.0216  False
     1      5   0.0055 0.8045 -0.0061 0.0171  False
     1      6  -0.1448  0.981 -0.6541 0.3645  False
     2      3  -0.0009    1.0 -0.0125 0.0107  False
     2      4   0.00

### Week Checking

In [21]:
from datetime import datetime
start_dates = {
    '2018': datetime(2018, 9, 1),
    '2019': datetime(2019, 9, 1),
    '2020': datetime(2020, 9, 1),
    '2021': datetime(2021, 9, 1),
    '2022': datetime(2022, 9, 1),
    '2023': datetime(2023, 9, 1)
}

for year, df in zip(years, dfs):
    # Ensure 'created_at' is converted to datetime and is timezone-naive
    df['created'] = pd.to_datetime(df['created']).dt.tz_localize(None)
    
    # Calculate the course week number
    df['week'] = ((df['created'] - start_dates[year]).dt.days // 7) + 1
    
    # Filter out data that is outside the 1-12 week range
    df = df[(df['week'] >= 1) & (df['week'] <= 12)]

for df in dfs:
    df['sentiment_score'] = df['content'].apply(get_sentiment)

In [22]:
import pandas as pd
import random
import statsmodels.api as sm
from statsmodels.formula.api import ols
from statsmodels.stats.multicomp import pairwise_tukeyhsd
import seaborn as sns
import matplotlib.pyplot as plt

# Number of iterations for random shuffling
num_iterations = 5

for iteration in range(num_iterations):
    print(f"Iteration {iteration + 1}")

    shuffled_dfs = []
    for df in dfs:
        # Shuffle only the sentiment scores within the dataframe
        shuffled_scores = random.sample(list(df['sentiment_score']), len(df['sentiment_score']))
        shuffled_df = df.copy()
        shuffled_df['sentiment_score'] = shuffled_scores
        shuffled_dfs.append(shuffled_df)

    # Concatenate the shuffled dataframes for ANOVA and Tukey HSD
    all_data = pd.concat(shuffled_dfs)
    all_data['year'] = all_data.index // len(all_data) // len(dfs[0])

    # One-Way ANOVA
    model = ols('sentiment_score ~ C(week)', data=all_data).fit()
    anova_table = sm.stats.anova_lm(model, typ=2)
    print("ANOVA Results:")
    print(anova_table)

    # Tukey HSD
    tukey_results = pairwise_tukeyhsd(endog=all_data['sentiment_score'], groups=all_data['week'], alpha=0.05)
    print("Tukey HSD Results:")
    print(tukey_results)

    # Regression Analysis
    for year, df in zip(years, shuffled_dfs):
        # OLS Regression
        model = sm.OLS(df['sentiment_score'], sm.add_constant(df['week'])).fit()
        print(f"OLS Regression Results for {year}:")
        print(model.summary())

Iteration 1
ANOVA Results:
               sum_sq       df         F    PR(>F)
C(week)      2.282965     19.0  2.014887  0.005476
Residual  2766.547812  46392.0       NaN       NaN


  quad_r = quad(f, low, high, args=args, full_output=self.full_output,


Tukey HSD Results:
Multiple Comparison of Means - Tukey HSD, FWER=0.05 
group1 group2 meandiff p-adj   lower   upper  reject
----------------------------------------------------
    -1      0  -0.0604 0.9989 -0.2202  0.0994  False
    -1      1  -0.1195 0.3013 -0.2666  0.0275  False
    -1      2  -0.0834 0.8391 -0.2218  0.0551  False
    -1      3  -0.0891 0.7471 -0.2273  0.0492  False
    -1      4  -0.0859 0.7959 -0.2238   0.052  False
    -1      5  -0.0893 0.7329 -0.2266  0.0481  False
    -1      6  -0.0881  0.754 -0.2256  0.0493  False
    -1      7  -0.0835 0.8378 -0.2219   0.055  False
    -1      8  -0.0881 0.7644 -0.2266  0.0503  False
    -1      9  -0.0844 0.8191 -0.2223  0.0535  False
    -1     10   -0.091 0.7027 -0.2286  0.0465  False
    -1     11  -0.0859 0.7909 -0.2231  0.0514  False
    -1     12  -0.0734 0.9404 -0.2108  0.0641  False
    -1     13  -0.0929 0.6668 -0.2306  0.0447  False
    -1     14  -0.0796 0.8816 -0.2172   0.058  False
    -1     15  -0.0683 0.97

  quad_r = quad(f, low, high, args=args, full_output=self.full_output,


Tukey HSD Results:
Multiple Comparison of Means - Tukey HSD, FWER=0.05
group1 group2 meandiff p-adj   lower  upper  reject
---------------------------------------------------
    -1      0  -0.0491 0.9999 -0.2089 0.1107  False
    -1      1  -0.0481 0.9999 -0.1951 0.0989  False
    -1      2   -0.032    1.0 -0.1704 0.1064  False
    -1      3  -0.0324    1.0 -0.1707 0.1058  False
    -1      4  -0.0348    1.0 -0.1727 0.1031  False
    -1      5  -0.0325    1.0 -0.1699 0.1049  False
    -1      6   -0.034    1.0 -0.1715 0.1035  False
    -1      7  -0.0341    1.0 -0.1726 0.1043  False
    -1      8  -0.0412    1.0 -0.1796 0.0973  False
    -1      9  -0.0361    1.0  -0.174 0.1018  False
    -1     10  -0.0375    1.0 -0.1751 0.1001  False
    -1     11  -0.0246    1.0 -0.1619 0.1128  False
    -1     12  -0.0274    1.0 -0.1648 0.1101  False
    -1     13  -0.0282    1.0 -0.1659 0.1094  False
    -1     14  -0.0263    1.0 -0.1639 0.1112  False
    -1     15  -0.0078    1.0 -0.1474 0.1317 

  quad_r = quad(f, low, high, args=args, full_output=self.full_output,


Tukey HSD Results:
Multiple Comparison of Means - Tukey HSD, FWER=0.05 
group1 group2 meandiff p-adj   lower   upper  reject
----------------------------------------------------
    -1      0   0.0592 0.9991 -0.1006  0.2189  False
    -1      1   0.0419    1.0 -0.1051   0.189  False
    -1      2   0.0541 0.9982 -0.0843  0.1925  False
    -1      3   0.0386    1.0 -0.0996  0.1769  False
    -1      4   0.0389    1.0  -0.099  0.1768  False
    -1      5   0.0412    1.0 -0.0961  0.1786  False
    -1      6   0.0355    1.0 -0.1019   0.173  False
    -1      7   0.0449 0.9999 -0.0936  0.1833  False
    -1      8   0.0456 0.9998 -0.0928  0.1841  False
    -1      9    0.053 0.9986 -0.0849  0.1909  False
    -1     10   0.0307    1.0 -0.1069  0.1683  False
    -1     11   0.0469 0.9997 -0.0904  0.1842  False
    -1     12   0.0502 0.9993 -0.0873  0.1876  False
    -1     13   0.0479 0.9996 -0.0897  0.1856  False
    -1     14   0.0509 0.9991 -0.0867  0.1885  False
    -1     15   0.0642  0.9