In [1]:
import pandas as pd
import numpy as np
import gensim
import re
from pathlib import Path
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer, TfidfVectorizer
from sklearn.model_selection import train_test_split 
from sklearn import metrics
from nltk.sentiment import SentimentIntensityAnalyzer
from matplotlib import pyplot as plt
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.tag import pos_tag

import nltk
nltk.download('vader_lexicon')

# %pip install Ipython
from IPython.display import display, Markdown


[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /Users/amittaijoel/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


You don't have to process the files manually; I created a HuggingFace Dataset.
You can import it and convert it to pandas as shown below.

_Note: you may have to run `pip install datasets` first._

In [2]:
# %pip install datasets
import datasets

# Cap on how many articles are kept per year (kept small just for testing).
MAX_ARTICLES_PER_YEAR = 5

# for more details see https://huggingface.co/datasets/siavava/ai-tech-articles
dataset = datasets.load_dataset("siavava/ai-tech-articles")
df = dataset["train"].to_pandas()  # there's only the 'train' split because HF requires it.

# Split df by year. Keys follow the order in which years first appear in df.
# Each per-year frame is an explicit copy: later cells add columns to these
# frames, and writing into a filtered/head() slice of `df` raises
# SettingWithCopyWarning on pandas versions without copy-on-write.
years = df["year"].unique()
dfs = {
    year: df[df["year"] == year].head(MAX_ARTICLES_PER_YEAR).copy()
    for year in years
}

# Preview the newest years first: a heading, the article count, and the
# first three rows of each per-year frame.
for year in sorted(years, reverse=True):
    display(Markdown(f"### Year: {year}"))
    display(Markdown(f"Number of articles: {len(dfs[year])}"))
    display(dfs[year].head(3))
    display(Markdown("---"))
    


### Year: 2023

Number of articles: 5

Unnamed: 0,id,year,title,url,text
0,0,2023,"""MIT Technology Review""","""https://www.technologyreview.com""","""Featured Topics Newsletters Events Podcasts F..."
1,1,2023,"""WIRED - The Latest in Technology, Science, Cu...","""https://www.wired.com""","""Open Navigation Menu To revisit this article,..."
3,3,2023,"""TechCrunch | Startup and Technology News""","""https://www.techcrunch.com""","""WeWork reportedly on the verge of filing for ..."


---

### Year: 2022

Number of articles: 5

Unnamed: 0,id,year,title,url,text
4,4,2022,"""A new vision of artificial intelligence for t...","""https://www.technologyreview.com/2022/04/22/1...","""Featured Topics Newsletters Events Podcasts A..."
5,5,2022,"""The scientist who co-created CRISPR isn’t rul...","""https://www.technologyreview.com/2022/04/26/1...","""Featured Topics Newsletters Events Podcasts F..."
6,6,2022,"""These fast, cheap tests could help us coexist...","""https://www.technologyreview.com/2022/04/27/1...","""Featured Topics Newsletters Events Podcasts F..."


---

### Year: 2021

Number of articles: 5

Unnamed: 0,id,year,title,url,text
42,44,2021,"""He got Facebook hooked on AI. Now he can't fi...","""https://www.technologyreview.com/2021/03/11/1...","""Featured Topics Newsletters Events Podcasts H..."
51,54,2021,"""The race to understand the thrilling, dangero...","""https://www.technologyreview.com/2021/05/20/1...","""Featured Topics Newsletters Events Podcasts F..."
53,56,2021,"""Wearable materials with embedded synthetic bi...","""https://www.nature.com/articles/s41587-021-00...","""Thank you for visiting nature.com. You are us..."


---

### Year: 2020

Number of articles: 5

Unnamed: 0,id,year,title,url,text
20,21,2020,"""Privacy Policy - Singularity""","""https://su.org/privacy-policy""","""Executive Program Custom Programs Resources I..."
38,40,2020,"""Algorithmic Colonization of Africa – SCRIPTed""","""https://script-ed.org/article/algorithmic-col...","""A Journal of Law, Technology & Society https:..."
39,41,2020,"""The messy, secretive reality behind OpenAI’s ...","""https://www.technologyreview.com/2020/02/17/8...","""Featured Topics Newsletters Events Podcasts F..."


---

### Year: 2019

Number of articles: 5

Unnamed: 0,id,year,title,url,text
2,2,2019,"""The Verge""","""https://www.theverge.com""","""The Verge homepage The Verge The Verge logo.\..."
8,8,2019,"""About - Google DeepMind""","""https://www.deepmind.com/about""","""DeepMind Search Search Close DeepMind About O..."
71,76,2019,"""AlphaStar: Mastering the real-time strategy g...","""https://deepmind.com/blog/article/alphastar-m...","""DeepMind Search Search Close DeepMind About O..."


---

### Year: 2018

Number of articles: 5

Unnamed: 0,id,year,title,url,text
68,72,2018,"""Scalable agent architecture for distributed t...","""https://deepmind.com/blog/article/impala-scal...","""DeepMind Search Search Close DeepMind About O..."
70,74,2018,"""DeepMind, meet Android - Google DeepMind""","""https://deepmind.com/blog/announcements/deepm...","""DeepMind Search Search Close DeepMind About O..."
77,82,2018,"""AlphaZero: Shedding new light on chess, shogi...","""https://deepmind.com/blog/article/alphazero-s...","""DeepMind Search Search Close DeepMind About O..."


---

### Year: 2017

Number of articles: 5

Unnamed: 0,id,year,title,url,text
67,71,2017,"""Indaba - Deep Learning Indaba 2023""","""http://www.deeplearningindaba.com""","""Indaba Organisers Affiliated Communities Spon..."
97,107,2017,"""Protein - Wikipedia""","""https://en.wikipedia.org/wiki/Protein""","""Main menu Main page Contents Current events R..."
98,109,2017,"""AlphaGo Zero: Starting from scratch - Google ...","""https://deepmind.com/blog/article/alphago-zer...","""DeepMind Search Search Close DeepMind About O..."


---

### Year: 2016

Number of articles: 5

Unnamed: 0,id,year,title,url,text
15,15,2016,"""AlphaGo - Google DeepMind""","""https://www.deepmind.com/research/highlighted...","""DeepMind Search Search Close DeepMind About O..."
49,52,2016,"""INDIGENOUS AI — PEOPLE""","""https://www.indigenous-ai.net/people""","""INDIGENOUS AI POSITION PAPER WORKSHOPS BLOG A..."
73,78,2016,"""AlphaGo - Google DeepMind""","""https://deepmind.com/research/case-studies/al...","""DeepMind Search Search Close DeepMind About O..."


---

### Year: 2015

Number of articles: 5

Unnamed: 0,id,year,title,url,text
297,310,2015,"""The Plot to Free North Korea With Smuggled Ep...","""https://www.wired.com/2015/03/north-korea""","""Open Navigation Menu To revist this article, ..."
325,339,2015,"""Commonsense reasoning and commonsense knowled...","""https://dl.acm.org/doi/10.1145/2701413""","""Dartmouth College Advanced Search Browse Abou..."
328,342,2015,"""The Philosopher of Doomsday | The New Yorker""","""https://www.newyorker.com/magazine/2015/11/23...","""A Reporter at Large The Doomsday Invention Wi..."


---

### Year: 2014

Number of articles: 5

Unnamed: 0,id,year,title,url,text
617,638,2014,"""Jiwei Li | MIT Technology Review""","""https://www.technologyreview.com/innovator/ji...","""Featured Topics Newsletters Events Podcasts F..."
754,782,2014,"""OII | People""","""https://www.oii.ox.ac.uk/people/dphil-students""","""Research Research Section Home Research Areas..."
798,829,2014,"""Foundations""","""https://www.iter.org/construction/tkmfoundati...","""Jobs 7 Tenders 78 Faqs Visits Contact us ITER..."


---

### Year: 2013

Number of articles: 5

Unnamed: 0,id,year,title,url,text
311,325,2013,"""Chill: Robots Won't Take All Our Jobs | WIRED""","""https://www.wired.com/2017/08/robots-will-not...","""WIRED Logo The Great Tech Panic: Robots Won’t..."
669,691,2013,"""Tech Policy Lab | University of Washington""","""https://techpolicylab.uw.edu""","""Tech Policy Lab University of Washington Join..."
2104,2159,2013,"""FDA Approves Eye Implant Enabling The Blind T...","""https://singularityhub.com/2013/02/19/fda-app...","""Topics AI Biotech Computing Space Energy Futu..."


---

### Year: 2012

Number of articles: 5

Unnamed: 0,id,year,title,url,text
52,55,2012,"""Rachel Haurwitz | MIT Technology Review""","""https://www.technologyreview.com/innovator/ra...","""Featured Topics Newsletters Events Podcasts F..."
985,1017,2012,"""Fiji: an open-source platform for biological-...","""https://doi.org/10.1038%2Fnmeth.2019""","""Thank you for visiting nature.com. You are us..."
1150,1191,2012,"""Department Of Computer Science | Makerere Uni...","""https://cs.mak.ac.ug/people/faculty""","""Makerere University Department of Computer Sc..."


---

### Year: 2011

Number of articles: 4

Unnamed: 0,id,year,title,url,text
730,756,2011,"""Stochastic gradient descent - Wikipedia""","""https://en.wikipedia.org/wiki/Stochastic_grad...","""Main menu Main page Contents Current events R..."
1116,1149,2011,"""IBM Blue Gene - Wikipedia""","""https://en.wikipedia.org/wiki/IBM_Blue_Gene""","""Main menu Main page Contents Current events R..."
2027,2082,2011,"""David J. Hill, Author at Singularity Hub""","""https://singularityhub.com/author/dhill""","""Topics AI Biotech Computing Space Energy Futu..."


---

### Year: 2010

Number of articles: 3

Unnamed: 0,id,year,title,url,text
401,421,2010,"""Synthetic biology: applications come of age -...","""http://www.ncbi.nlm.nih.gov/pmc/articles/PMC2...","""An official website of the United States gove..."
408,428,2010,"""Synthetic biology: applications come of age |...","""https://doi.org/10.1038%2Fnrg2775""","""Thank you for visiting nature.com. You are us..."
1606,1655,2010,"""REBUS and the Anarchic Brain: Toward a Unifie...","""https://doi.org/10.1124/pr.118.017160""","""Main menu Home Articles Current Issue Fast Fo..."


---

### Year: 2009

Number of articles: 5

Unnamed: 0,id,year,title,url,text
688,710,2009,"""Stroop and picture—word interference are two ...","""https://doi.org/10.3758%2FPBR.16.6.987""","""Advertisement Log in Menu Search Cart Home Ps..."
696,719,2009,"""""Computational and Biological Learning Lab, C...","""http://www.cs.nyu.edu/~yann/index""","""CBLL HOME VLG Group News/Events Seminars Peop..."
698,721,2009,"""""CBLL, Research Projects, Computational and B...","""http://www.cs.nyu.edu/~yann/research/lagr""","""CBLL HOME VLG Group News/Events Seminars Peop..."


---

### Year: 2008

Number of articles: 5

Unnamed: 0,id,year,title,url,text
265,277,2008,"""Abundance360 by Peter Diamandis""","""https://www.abundance360.com""","""Abundance360 Applications Are Now Open by Sin..."
400,420,2008,"""Diversity and Evolution of Coral Fluorescent ...","""http://www.ncbi.nlm.nih.gov/pmc/articles/PMC2...","""An official website of the United States gove..."
775,804,2008,"""Stockfish (chess) - Wikipedia""","""https://en.wikipedia.org/wiki/Stockfish_(chess)""","""Main menu Main page Contents Current events R..."


---

### Year: 2007

Number of articles: 3

Unnamed: 0,id,year,title,url,text
699,723,2007,"""""CBLL, Research Projects, Computational and B...","""http://www.cs.nyu.edu/~yann/research/relreg""","""CBLL HOME VLG Group News/Events Seminars Peop..."
1120,1153,2007,"""Discovering DNA: Friedrich Miescher and the e...","""https://doi.org/10.1007%2Fs00439-007-0433-0""","""Advertisement Log in Menu Search Cart Home Hu..."
1123,1157,2007,"""Identification and analysis of functional ele...","""https://doi.org/10.1038%2Fnature05874""","""Thank you for visiting nature.com. You are us..."


---

### Year: 2006

Number of articles: 4

Unnamed: 0,id,year,title,url,text
697,720,2006,"""""CBLL, Research Projects, Computational and B...","""http://www.cs.nyu.edu/~yann/research""","""CBLL HOME VLG Group News/Events Seminars Peop..."
897,928,2006,"""Project MUSE - The History of Ideas: Precept ...","""https://doi.org/10.1353%2Fjhi.2006.0006""","""Access provided by Dartmouth College Library ..."
1124,1158,2006,"""An integrated view of protein evolution | Nat...","""https://doi.org/10.1038%2Fnrg1838""","""Thank you for visiting nature.com. You are us..."


---

### Year: 2005

Number of articles: 5

Unnamed: 0,id,year,title,url,text
902,934,2005,"""自然語言處理 - 维基百科，自由的百科全书""","""https://zh.wikipedia.org/wiki/%E8%87%AA%E7%84...","""主菜单 首页 分类索引 特色内容 新闻动态 最近更改 随机条目 资助维基百科 帮助 维基社..."
903,935,2005,"""ISOTOPES IN MARINE SEDIMENTS | SpringerLink""","""https://doi.org/10.1007%2F1-4020-2504-1_06""","""Advertisement Log in Menu Search Cart Isotope..."
2014,2069,2005,"""How Quantum Computers Can Be Used to Build Be...","""https://blubrry.com/singularityhub/81633365/h...","""Podcast Hosting Hosting Plans Easy-to-use too..."


---

### Year: 2004

Number of articles: 1

Unnamed: 0,id,year,title,url,text
2298,2362,2004,"""Halo 2 | Halo Alpha | Fandom""","""https://halo.fandom.com/wiki/Halo_2""","""Explore Main Page Discuss All Pages Community..."


---

### Year: 2003

Number of articles: 1

Unnamed: 0,id,year,title,url,text
2266,2326,2003,"""Introduction to the Old English poem called B...","""https://www.heorot.dk/beowulf-vorwort""","""An Introduction to the Structure & Making of ..."


---

### Year: 2001

Number of articles: 3

Unnamed: 0,id,year,title,url,text
1122,1155,2001,"""Chromosome territories, nuclear architecture ...","""https://doi.org/10.1038%2F35066075""","""Thank you for visiting nature.com. You are us..."
1142,1183,2001,"""Department Of Computer Science | Makerere Uni...","""https://cs.mak.ac.ug/curriculum""","""Makerere University Department of Computer Sc..."
1143,1184,2001,"""Department Of Computer Science | Makerere Uni...","""https://cs.mak.ac.ug/curriculum/undergraduate...","""Makerere University Department of Computer Sc..."


---

The data is separated by year. I will be starting from 2000, and I removed the unknown sub-folder within the data folder. Each of the folders will be converted to a dataframe that corresponds to a specific year, with rows that represent each of the files.

Each of the files in the folders will first go through a content check: if the document is too short (empty) or too long (over 1000 lines), it will be discarded.

If the file passes the length check it will then be shortened to remove the header and have its contents added to the dataframe.

This dataframe will then be added to a dictionary that maps years to the corresponding dataframe. 


> I think this is good filtering. You can easily do this in the dfs generated above.

In [3]:
# def file_process(filename):
    
#     file = open(filename, "r")
#     num_lines = len(file.readlines())
    
#     if num_lines < 15 or num_lines > 400:
#         file.close()
#         return None
    
#     else:
#         file.seek(0) 
#         content = file.readlines()[5:]

#     text = " ".join(content)
#     file.close()
    
#     return text


In [4]:
#print(file_process("../data/categorized/2023/440.txt"))


In [5]:
# def create_dataframe(foldername):

#     folder = Path(foldername)
#     df = pd.DataFrame(columns=['ID', 'Text'])
#     index = 0

#     for filename in sorted(folder.iterdir()):
#         if filename.is_file() == True:
#             text = file_process(filename)
#             if text != None:
#                 id = re.search(r'(\d+)\.txt$', str(filename))
#                 id = int(id.group(1))
#                 df.loc[index] = [str(id),text]
#                 index = index + 1
    
#     return df


In [6]:
#display(create_dataframe('../data/categorized/2013'))


In [7]:
# def create_dictionary(data_folder):
#     year_dict = {}
#     folder = Path(data_folder)
#     begin = False

#     for year_folder in sorted(folder.iterdir()):
        
#         if year_folder.is_dir() == True:
#             year_key = re.search(r'categorized/(\d+)', str(year_folder))
#             year_key = int(year_key.group(1))
            
#             if year_key == 2000:
#                 begin = True
        
#             if begin == True:
#                 year_df = create_dataframe(year_folder)

#                 if year_df.empty == False:
#                     year_dict[year_key] = year_df
    
#     return year_dict


In [8]:
# year_dictionary = create_dictionary('../data/categorized')

# Use the per-year frames already held in `dfs` rather than the
# (commented-out) file-based builder above. This binds a second name to the
# same dict object; no copy is made.
year_dictionary = dfs


Now that we have the year dictionary we can begin the process of sentiment analysis.

The sentiment analysis model will return four sentiment scores (negative, neutral, positive, and compound) for each file. Each score will then be averaged across all the files to get an average sentiment for the year.

From there we can do cosine similarity and plot sentiment for each year.

In [9]:
def add_sentiment(dataframe, sentiment_model=None):
    """Add sentiment score columns to ``dataframe`` in place.

    Every entry of ``dataframe['text']`` is scored with
    ``sentiment_model.polarity_scores`` and the results are stored in four
    new (or overwritten) columns: ``Negative``, ``Neutral``, ``Positive``
    and ``Compound``.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame with a ``text`` column. It is modified in place.
    sentiment_model : object, optional
        Any object exposing ``polarity_scores(text)`` that returns a mapping
        with the keys ``neg``, ``neu``, ``pos`` and ``compound``. When
        omitted, a new ``SentimentIntensityAnalyzer`` is created for this
        call. Pass a shared instance when scoring many frames to avoid
        rebuilding the analyzer each time.

    Returns
    -------
    pandas.DataFrame
        The same ``dataframe`` object, returned for convenience.

    Notes
    -----
    Entries of ``text`` that are not strings (for example ``None`` or NaN
    for a missing article body) are not passed to the model; their four
    scores are set to NaN.
    """
    if sentiment_model is None:
        sentiment_model = SentimentIntensityAnalyzer()

    # (output column name, key in the polarity_scores() result)
    score_columns = (
        ('Negative', 'neg'),
        ('Neutral', 'neu'),
        ('Positive', 'pos'),
        ('Compound', 'compound'),
    )
    missing = {key: np.nan for _, key in score_columns}

    # Score each text exactly once; non-string entries get NaN placeholders.
    scores = [
        sentiment_model.polarity_scores(text) if isinstance(text, str) else missing
        for text in dataframe['text']
    ]

    # Plain lists are assigned positionally, so any row index is preserved.
    for column, key in score_columns:
        dataframe[column] = [score[key] for score in scores]

    return dataframe


In [10]:
# text = ["CS89 is an amazing class!"]

# df = pd.DataFrame({'Text': text})

# add_sentiment(df)
# display(df)


In [11]:
# Score every per-year frame; add_sentiment adds the score columns in place.
# The loop variable is deliberately not named `df`: at module level that
# would rebind the full-dataset `df` loaded earlier to the last per-year
# frame, leaving stale state for any cell that uses `df` afterwards.
for year_df in year_dictionary.values():
    add_sentiment(year_df)


In [12]:
def get_sentiment(df, sentiment_type):
    """Return the mean of one sentiment column.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the column named by ``sentiment_type``.
    sentiment_type : str
        Name of the column to average.

    Returns
    -------
    The result of ``np.mean`` over that column.
    """
    column_scores = df[sentiment_type]
    return np.mean(column_scores)


In [13]:
def get_sentiment_data(year_dict):
    """Summarise mean sentiment per year in a single DataFrame.

    Parameters
    ----------
    year_dict : dict
        Maps a year to a DataFrame that has the columns ``Negative``,
        ``Neutral``, ``Positive`` and ``Compound``.

    Returns
    -------
    pandas.DataFrame
        One row per entry of ``year_dict``, in the dict's iteration order,
        with columns ``Year``, ``Negative``, ``Neutral``, ``Positive`` and
        ``Compound``. Each sentiment value is the mean returned by
        ``get_sentiment`` for that year's frame. An empty ``year_dict``
        yields an empty frame with the same columns.
    """
    sentiment_types = ['Negative', 'Neutral', 'Positive', 'Compound']

    # Collect one record per year and build the frame once. Filling an empty
    # frame row by row with .loc stores each mixed int/float row as floats,
    # which turned Year into 2023.0 despite the int() cast.
    records = []
    for year, df in year_dict.items():
        record = {'Year': int(year)}
        for s_type in sentiment_types:
            record[s_type] = get_sentiment(df, s_type)
        records.append(record)

    return pd.DataFrame(records, columns=['Year'] + sentiment_types)


In [14]:
# Build the per-year table of mean sentiment scores and render it.
sentiment_data_df = get_sentiment_data(year_dictionary)
display(sentiment_data_df)


Unnamed: 0,Year,Negative,Neutral,Positive,Compound
0,2023.0,0.0538,0.7938,0.1526,0.99708
1,2019.0,0.0296,0.8466,0.1236,0.9978
2,2022.0,0.029,0.8532,0.1178,0.99926
3,2016.0,0.0284,0.8348,0.1368,0.9984
4,2020.0,0.0624,0.8376,0.1,0.60686
5,2021.0,0.0538,0.8486,0.0974,0.60674
6,2012.0,0.031,0.8702,0.0988,0.59742
7,2017.0,0.0218,0.8612,0.1168,0.99866
8,2018.0,0.0172,0.838,0.1448,0.99828
9,2008.0,0.0228,0.8656,0.1116,0.99808
