In [2]:
import pandas as pd
import numpy as np
import json
import glob
import spacy

#gensim
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel

from datetime import datetime
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import SnowballStemmer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
import pyLDAvis
import pyLDAvis.gensim

In [3]:
# Read the CSV file
incidents = pd.read_csv("/Users/patrickdunnington/Desktop/DS_Capstone/msds_capstone_2024/personal_notebooks/patrick_nb/incidents_clean.csv")

# Convert date column to datetime format
incidents['date'] = pd.to_datetime(incidents['date'])

# Clean dataset
for i in range(1, 565):
    incidents.loc[i, 'reportnumber'] = incidents.loc[i, 'reports'].count(",") + 1

# Clean description column
incidents['clean_description'] = incidents['description'].str.replace('ai', '').str.replace('AI', '')

# Tokenize and remove stop words
stop_words = set(stopwords.words('english'))
stemmer = SnowballStemmer('english')

def preprocess_text(text):
    tokens = word_tokenize(text.lower())
    tokens = [stemmer.stem(token) for token in tokens if token.isalpha() and token not in stop_words]
    return ' '.join(tokens)

incidents['clean_description'] = incidents['clean_description'].apply(preprocess_text)

# Create document-term matrix
vectorizer = CountVectorizer()
incident_DTM = vectorizer.fit_transform(incidents['clean_description'])

# Perform LDA
lda = LatentDirichletAllocation(n_components=10, random_state=321)
incident_topics = lda.fit_transform(incident_DTM)

# Print top terms for each topic
feature_names = vectorizer.get_feature_names_out()
for topic_idx, topic in enumerate(lda.components_):
    top_terms = [feature_names[i] for i in topic.argsort()[:-11:-1]]
    print(f"Topic {topic_idx + 1}: {', '.join(top_terms)}")


Topic 1: report, tesla, vehicl, crash, autopilot, autonom, result, cruis, oper, alleg
Topic 2: system, alleg, algorithm, use, voic, data, driver, result, caus, ask
Topic 3: tesla, use, model, report, alleg, autopilot, generat, voic, driver, result
Topic 4: use, report, algorithm, user, generat, alleg, sexual, ad, involv, deploy
Topic 5: content, report, student, alleg, use, featur, due, video, school, moder
Topic 6: content, user, report, algorithm, facebook, alleg, googl, result, use, post
Topic 7: use, report, data, alleg, risk, incid, tool, user, predict, person
Topic 8: use, system, fals, imag, report, alleg, amazon, algorithm, generat, polic
Topic 9: alleg, algorithm, use, system, user, recognit, facial, autom, face, compani
Topic 10: alleg, report, use, violat, system, polit, tesla, video, driver, facial


In [4]:
# import nltk
# nltk.download('punkt')

import pandas as pd
import numpy as np
import json
import glob
import spacy

#Gensim
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel

from datetime import datetime
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import SnowballStemmer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
import pyLDAvis
import pyLDAvis.gensim

# Read the CSV file
incidents = pd.read_csv("/Users/patrickdunnington/Desktop/DS_Capstone/msds_capstone_2024/personal_notebooks/patrick_nb/incidents_clean.csv")

# Convert date column to datetime format
incidents['date'] = pd.to_datetime(incidents['date'])

# Clean dataset
for i in range(1, 565):
    incidents.loc[i, 'reportnumber'] = incidents.loc[i, 'reports'].count(",") + 1

# Clean description column
incidents['clean_description'] = incidents['description'].str.replace('ai', '').str.replace('AI', '')

def lemmatization(incidents, allowed_postags=["NOUN","ADJ","VERB","ADV"]):
    nlp = spacy.load("en_core_web_sm", disable = ["parser", "ner"])
    descript_out = []
    for incident in incidents:
        doc = nlp(incident)
        new_descript = []
        for token in doc:
            if token.pos_ in allowed_postags:
                new_descript.append(token.lemma_)
        final = ' '.join(new_descript)
        descript_out.append(final)      
    return (descript_out) 

lemmatized_decript = lemmatization(incidents['clean_description'])

def gen_words(incidents):
    final = []
    for incident in incidents:
        new = gensim.utils.simple_preprocess(incident, deacc=True)
        final.append(new)
    return (final)

data_words = gen_words(lemmatized_decript)

id2word = corpora.Dictionary(data_words)

corpus = []

for incident in data_words:
    new = id2word.doc2bow(incident)
    corpus.append(new)

# print(corpus[0][0:20])

word = id2word[[0][:1][0]]

lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                            id2word=id2word,
                                            num_topics=20,
                                            random_state=100,
                                            update_every=1,
                                            chunksize=100,
                                            passes=10,
                                            alpha='auto')

pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim.prepare(lda_model, corpus, id2word, mds='mmds', R=30)
pyLDAvis.save_html(vis, '/Users/patrickdunnington/Desktop/DS_Capstone/msds_capstone_2024/personal_notebooks/patrick_nb/ldavisual.html')

