In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [31]:
import os
import re
import nltk
import collections
import pandas as pd
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

In [3]:
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('vader_lexicon')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package vader_lexicon to /root/nltk_data...


True

In [4]:
legislation_folder = "/content/drive/MyDrive/COEN_396B/project/legislation"

legislation_data = {}
for filename in os.listdir(legislation_folder):
    if filename.endswith(".txt"):
        year = int(filename.split(".")[0])
        with open(os.path.join(legislation_folder, filename), "r") as file:
            text = file.read()
            legislation_data[year] = text

In [5]:
def clean_text(text):
    # Remove non-alphanumeric characters
    text = re.sub(r"[^a-zA-Z0-9]", " ", text)
    
    # Convert text to lowercase
    text = text.lower()
    
    # Tokenize text into individual words
    words = word_tokenize(text)
    
    # Remove stopwords
    stop_words = set(stopwords.words("english"))
    words = [word for word in words if word not in stop_words]
    
    # Lemmatize words
    lemmatizer = WordNetLemmatizer()
    words = [lemmatizer.lemmatize(word) for word in words]
    
    # Join cleaned words back into a string
    cleaned_text = " ".join(words)
    
    return cleaned_text
    

In [6]:
clean_legislation_data = {}

for year, text in legislation_data.items():
    cleaned_text = clean_text(text)  
    clean_legislation_data[year] = cleaned_text

In [7]:
clean_legislation_data[1995]

'ab 59 sher solid waste facility permit enforcement revised solid waste facility permitting enforcement activity carried ciwmb lea provided imposition civil liability administratively lea ciwmb solid waste facility operator compliance permitting requirement permit term condition state minimum standard related permitting handling disposal solid waste established detailed procedure ciwmb acting enforcement agency clarified process procedure requirement designation operation evaluation lea clarified requirement operator wish change solid waste facility design operation urgency measure chapter 952 ab 381 baca solid waste diversion requirement revised definition good faith effort part criterion used ciwmb determining whether impose civil penalty local jurisdiction failure implement certain planning element include evaluation city county regional agency improved technology handling management solid waste would result specified benefit chapter 219 ab 389 cannella agriculture environmental far

In [28]:
top_words_by_year = {}
words_to_exclude = ["ab", "sb", "chapter", "bill"]

for year, text in clean_legislation_data.items():

    text = re.sub(r'\d+', '', text)
    words = text.split()
    words = [word for word in words if word not in words_to_exclude]
    word_counts = collections.Counter(words)
    sorted_words = word_counts.most_common()
    top_words = [{word, counts} for word, counts in sorted_words[:5]]
    top_words_by_year[year] = top_words

for year, top_words in top_words_by_year.items():
    print(f"Most frequent words for {year}:")
    print(top_words)

Most frequent words for 2018:
[{'calrecycle', 9}, {8, 'recycling'}, {'statute', 7}, {'food', 7}, {'waste', 6}]
Most frequent words for 2017:
[{'container', 4}, {3, 'statute'}, {'beverage', 3}, {'polyethylene', 3}, {3, 'terephthalate'}]
Most frequent words for 2016:
[{19, 'statute'}, {'waste', 17}, {10, 'requires'}, {'calrecycle', 10}, {10, 'battery'}]
Most frequent words for 2015:
[{8, 'statute'}, {'recycling', 5}, {'waste', 5}, {'wood', 5}, {'project', 4}]
Most frequent words for 2014:
[{'waste', 16}, {8, 'recycling'}, {'material', 6}, {'public', 5}, {5, 'requires'}]
Most frequent words for 2013:
[{8, 'recycling'}, {'resource', 7}, {5, 'solid'}, {'waste', 5}, {'vehicle', 4}]
Most frequent words for 2012:
[{'waste', 16}, {5, 'solid'}, {5, 'facility'}, {'container', 5}, {5, 'requirement'}]
Most frequent words for 2011:
[{11, 'recycling'}, {'waste', 10}, {'solid', 7}, {'fund', 6}, {5, 'requires'}]
Most frequent words for 2010:
[{'fund', 12}, {11, 'program'}, {'state', 7}, {'product', 6},

In [29]:
from sklearn.feature_extraction.text import TfidfVectorizer

top_words_by_year = {}
words_to_exclude = ["ab", "sb", "chapter", "bill"]

for year, text in clean_legislation_data.items():
    text = re.sub(r'\d+', '', text)
    words = text.split()
    words = [word for word in words if word not in words_to_exclude]
    document = ' '.join(words)
    documents = [document]  # Wrap the document in a list for TF-IDF vectorizer
    vectorizer = TfidfVectorizer()
    tfidf_matrix = vectorizer.fit_transform(documents)
    feature_names = vectorizer.get_feature_names_out()
    tfidf_scores = zip(feature_names, tfidf_matrix.toarray()[0])
    sorted_words = sorted(tfidf_scores, key=lambda x: x[1], reverse=True)
    top_words = [{word: score} for word, score in sorted_words[:5]]
    top_words_by_year[year] = top_words

for year, top_words in top_words_by_year.items():
    print(f"Most important words for {year}:")
    print(top_words)

Most important words for 2018:
[{'calrecycle': 0.3247595264191645}, {'recycling': 0.2886751345948129}, {'food': 0.2525907427704613}, {'statute': 0.2525907427704613}, {'environmental': 0.21650635094610968}]
Most important words for 2017:
[{'container': 0.3606678538669729}, {'beverage': 0.2705008904002297}, {'polyethylene': 0.2705008904002297}, {'statute': 0.2705008904002297}, {'terephthalate': 0.2705008904002297}]
Most important words for 2016:
[{'statute': 0.35167099763694676}, {'waste': 0.3146529978856892}, {'battery': 0.18508999875628776}, {'calrecycle': 0.18508999875628776}, {'requires': 0.18508999875628776}]
Most important words for 2015:
[{'statute': 0.38989614306286347}, {'recycling': 0.24368508941428965}, {'waste': 0.24368508941428965}, {'wood': 0.24368508941428965}, {'environmental': 0.19494807153143173}]
Most important words for 2014:
[{'waste': 0.5196558419693047}, {'recycling': 0.25982792098465235}, {'material': 0.19487094073848926}, {'bag': 0.16239245061540772}, {'business'

In [47]:
top_words_by_year = {}
words_to_exclude = ["ab", "sb", "chapter", "bill", "ciwmb"]

for year, text in clean_legislation_data.items():
    text = re.sub(r'\d+', '', text)
    words = text.split()
    words = [word for word in words if word not in words_to_exclude]
    document = ' '.join(words)
    documents = [document]  # Wrap the document in a list for TF-IDF vectorizer
    vectorizer = TfidfVectorizer()
    tfidf_matrix = vectorizer.fit_transform(documents)
    feature_names = vectorizer.get_feature_names_out()
    tfidf_scores = zip(feature_names, tfidf_matrix.toarray()[0])
    sorted_words = sorted(tfidf_scores, key=lambda x: x[1], reverse=True)
    top_words = {word: score for word, score in sorted_words}
    top_words_by_year[year] = top_words

# Create a dataframe to store the TF-IDF scores
df = pd.DataFrame.from_dict(top_words_by_year, orient='index')
df = df.iloc[::-1, :]
# Transpose the dataframe and fill missing values with 0
df = df.T.fillna(0)
df

Unnamed: 0,1995,1996,1997,1998,1999,2000,2001,2002,2003,2004,...,2009,2013,2010,2011,2012,2014,2015,2016,2017,2018
calrecycle,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.146052,0.071338,0.102899,0.097435,0.146211,0.185090,0.090167,0.324760
recycling,0.070452,0.121018,0.096114,0.128624,0.138900,0.037932,0.137732,0.033331,0.085448,0.253045,...,0.157676,0.349482,0.146052,0.392357,0.137199,0.259828,0.243685,0.166581,0.090167,0.288675
food,0.000000,0.060509,0.000000,0.000000,0.089293,0.000000,0.000000,0.000000,0.000000,0.015815,...,0.000000,0.000000,0.000000,0.000000,0.034300,0.032478,0.048737,0.037018,0.090167,0.252591
statute,0.000000,0.000000,0.000000,0.000000,0.019843,0.018966,0.019676,0.022221,0.000000,0.015815,...,0.000000,0.043685,0.058421,0.000000,0.034300,0.000000,0.389896,0.351671,0.270501,0.252591
environmental,0.083262,0.030254,0.000000,0.000000,0.099214,0.189661,0.196760,0.199988,0.256345,0.015815,...,0.000000,0.043685,0.058421,0.035669,0.102899,0.032478,0.194948,0.092545,0.000000,0.216506
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
waive,0.006405,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
waiver,0.006405,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
weigh,0.006405,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
wishing,0.006405,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000


In [48]:
df.to_csv('/content/drive/MyDrive/COEN_396B/project/TF-IDF_score.csv', index=True)

In [21]:
from nltk.sentiment import SentimentIntensityAnalyzer

sia = SentimentIntensityAnalyzer()
sentiment_scores = {}

for year, text in clean_legislation_data.items():
    # sentiment_scores[year] = sia.polarity_scores(text)["compound"]
    sentiment_scores[year] = sia.polarity_scores(text)
sentiment_scores

{2018: {'neg': 0.055, 'neu': 0.847, 'pos': 0.099, 'compound': 0.9081},
 2017: {'neg': 0.047, 'neu': 0.861, 'pos': 0.092, 'compound': 0.5423},
 2016: {'neg': 0.073, 'neu': 0.825, 'pos': 0.102, 'compound': 0.9217},
 2015: {'neg': 0.082, 'neu': 0.853, 'pos': 0.065, 'compound': -0.7964},
 2014: {'neg': 0.139, 'neu': 0.75, 'pos': 0.111, 'compound': -0.9657},
 2013: {'neg': 0.056, 'neu': 0.807, 'pos': 0.137, 'compound': 0.9337},
 2012: {'neg': 0.133, 'neu': 0.774, 'pos': 0.094, 'compound': -0.9694},
 2011: {'neg': 0.103, 'neu': 0.817, 'pos': 0.079, 'compound': -0.9565},
 2010: {'neg': 0.042, 'neu': 0.866, 'pos': 0.092, 'compound': 0.969},
 2009: {'neg': 0.148, 'neu': 0.765, 'pos': 0.087, 'compound': -0.9451},
 2008: {'neg': 0.098, 'neu': 0.777, 'pos': 0.125, 'compound': 0.9022},
 2007: {'neg': 0.149, 'neu': 0.675, 'pos': 0.176, 'compound': 0.6705},
 2006: {'neg': 0.112, 'neu': 0.72, 'pos': 0.168, 'compound': 0.9955},
 2005: {'neg': 0.078, 'neu': 0.856, 'pos': 0.065, 'compound': -0.9217},
 20

In [22]:
from textblob import TextBlob
sentiment_scores = {}

for year, text in clean_legislation_data.items():
    blob = TextBlob(text)
    sentiment_scores[year] = blob.sentiment.polarity
sentiment_scores

{2018: 0.058695652173913045,
 2017: 0.0035714285714285657,
 2016: 0.015308734058734065,
 2015: -0.07142857142857142,
 2014: -0.018701298701298698,
 2013: 0.0071645021645021685,
 2012: -0.0414451827242525,
 2011: -0.04465894465894467,
 2010: 0.002861952861952862,
 2009: -0.09948347107438016,
 2008: 0.0850517129928894,
 2007: 0.11777643260694105,
 2006: 0.12068452380952387,
 2005: 0.006569664902998241,
 2004: 0.013095238095238083,
 2003: 0.10880875986758334,
 2002: -0.01001940133037694,
 2001: 0.028593595159860208,
 2000: 0.026839826839826816,
 1999: 0.06948025551684088,
 1998: -0.07257142857142856,
 1997: 0.09459459459459457,
 1996: 0.023652422845971206,
 1995: 0.0541418807270543}