In [5]:
import os
import pandas as pd
from datetime import datetime, timedelta
from gensim import corpora, models
import nltk
from nltk.corpus import stopwords
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation
from nltk.stem import WordNetLemmatizer
import nltk


[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/nicolasmalz/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [52]:
directory = 'output_texts/'

texts = []


# Loop through the file range
for i in range(1, 500):
    file_path = os.path.join(directory, f'{i}.txt')

    # Check if the file exists to avoid errors
    if os.path.exists(file_path):
        with open(file_path, 'r', encoding='utf-8') as file:
            texts.append(file.read())
    else:
        print(f"File {i}.txt not found.")

File 2.txt not found.
File 3.txt not found.
File 4.txt not found.
File 5.txt not found.
File 205.txt not found.
File 217.txt not found.
File 243.txt not found.
File 313.txt not found.
File 332.txt not found.
File 456.txt not found.
File 485.txt not found.
File 486.txt not found.
File 487.txt not found.
File 488.txt not found.
File 489.txt not found.
File 490.txt not found.
File 491.txt not found.
File 492.txt not found.
File 493.txt not found.
File 494.txt not found.
File 495.txt not found.
File 496.txt not found.
File 497.txt not found.
File 498.txt not found.
File 499.txt not found.


In [53]:


nltk.download('stopwords')
nltk.download('wordnet')

# Define a preprocessing function
def preprocess(text):
    # Lowercasing
    
    text = text.lower()
    text = re.sub(r'http[s]?://\S+', '', text)
    # Remove special characters
    text = re.sub(r'\W', ' ', text)
    # Remove single characters
    text = re.sub(r'\s+[a-zA-Z]\s+', ' ', text)

    text = re.sub('coalwire', '', text)
    text = re.sub('listupdate', '', text)
    text = re.sub('subscription', '', text)
    text = re.sub('toeditor', '', text)
    
    # Lemmatize text
    lemmatizer = WordNetLemmatizer()
    words = text.split()
    words = [lemmatizer.lemmatize(word) for word in words if word not in stopwords.words('english')]
    return ' '.join(words)

# Apply preprocessing to each newsletter
cleaned_texts = [preprocess(text) for text in texts]

# Feature Extraction with TF-IDF
vectorizer = TfidfVectorizer(max_df=0.7, min_df=10, ngram_range=(1, 3))
tfidf_matrix = vectorizer.fit_transform(cleaned_texts)


[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/nicolasmalz/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/nicolasmalz/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [54]:
# Determine the number of topics
n_topics = 10  # Adjust based on experimentation

lda = LatentDirichletAllocation(n_components=n_topics, max_iter=10, learning_method='online')
lda.fit(tfidf_matrix)

# Function to display topics
def display_topics(model, feature_names, no_top_words):
    for topic_idx, topic in enumerate(model.components_):
        print(f"Topic {topic_idx}:")
        print(" ".join([feature_names[i] for i in topic.argsort()[:-no_top_words - 1:-1]]))

no_top_words = 10
display_topics(lda, vectorizer.get_feature_names_out(), no_top_words)


Topic 0:
coal ash adani 2015 suggested tweet lobby dumping 2014 ash tweet proposed plant
Topic 1:
tweet 2016 import solar 2018 proposal domestic peabody trump analysis
Topic 2:
adani lignite suggested terminal 2015 tweet 2017 2013 permit email
Topic 3:
investigation 2014 eskom 2012 standard chartered world bank uk underground coal gasification dumping death
Topic 4:
suggested 2014 terminal suggested tweet tamil close coal import tweet 39 per cent coal industry
Topic 5:
tweet adani coal industry suggested tweet 2014 pakistan spill 2016 pdf coal india 2016
Topic 6:
adani unit eskom steel 2022 solar import 2030 2021 ash
Topic 7:
day suggested tweet coal gasification plant 2014 know reach agreement going quality movement 500 mw
Topic 8:
cc dam tweet billiton terminal 2014 bhp ash coal fired bhp billiton
Topic 9:
forced opposition proposed tell poor back away 2015 assumes challenge proposed mercury air south african


In [55]:
import pandas as pd
import numpy as np

# Assuming you have 500 newsletters
num_newsletters = len(texts)

# Generate dates starting from August 29, 2013, one per week
start_date = '2013-08-29'
dates = pd.date_range(start=start_date, periods=num_newsletters, freq='7D')

# You already have the dominant_topics from the LDA model
dominant_topics = np.argmax(lda.transform(tfidf_matrix), axis=1)

# Create a DataFrame for analysis
df = pd.DataFrame({'date': dates, 'dominant_topic': dominant_topics})
df['year'] = df['date'].dt.year

# Group by year and dominant topic to see trends
trend_analysis = df.groupby(['year', 'dominant_topic']).size().unstack().fillna(0)
print(trend_analysis)


dominant_topic   6
year              
2013            18
2014            52
2015            53
2016            52
2017            52
2018            52
2019            52
2020            53
2021            52
2022            38


In [56]:
from textblob import TextBlob

# Function to calculate sentiment
def calculate_sentiment(text):
    return TextBlob(text).sentiment.polarity

# Calculate sentiment for each newsletter
df['sentiment'] = [calculate_sentiment(text) for text in cleaned_texts]

# Analyze sentiment trends
sentiment_trends = df.groupby('year')['sentiment'].mean()
print(sentiment_trends)


year
2013    0.038835
2014    0.039822
2015    0.038121
2016    0.046177
2017    0.046940
2018    0.046014
2019    0.047068
2020    0.045501
2021    0.043501
2022    0.048209
Name: sentiment, dtype: float64


In [57]:
import networkx as nx
import itertools

# Build a co-occurrence matrix for topics
topic_combinations = itertools.combinations(range(n_topics), 2)
co_occurrence_matrix = pd.DataFrame(0, index=range(n_topics), columns=range(n_topics))

for combination in topic_combinations:
    co_occurrence_matrix.at[combination] += np.sum((df['dominant_topic'] == combination[0]) & (df['dominant_topic'] == combination[1]))



In [58]:
import networkx as nx

G = nx.from_pandas_adjacency(co_occurrence_matrix)

# Girvan-Newman method
from networkx.algorithms.community import girvan_newman

communities_generator = girvan_newman(G)
top_level_communities = next(communities_generator)

# Convert communities to a more readable format
communities = [list(community) for community in top_level_communities]

# Print out the communities
for i, community in enumerate(communities):
    print(f"Community {i}: {community}")


Community 0: [0]
Community 1: [1]
Community 2: [2]
Community 3: [3]
Community 4: [4]
Community 5: [5]
Community 6: [6]
Community 7: [7]
Community 8: [8]
Community 9: [9]


In [None]:
import matplotlib.pyplot as plt

# Draw the network
plt.figure(figsize=(12, 12))
pos = nx.spring_layout(G)  # positions for all nodes

# Draw nodes
nx.draw_networkx_nodes(G, pos, node_size=700, cmap=plt.cm.RdYlBu, node_color=list(partition.values()))

# Draw edges
nx.draw_networkx_edges(G, pos, alpha=0.3)

# Draw labels
nx.draw_networkx_labels(G, pos, font_size=12, font_family='sans-serif')

plt.title("Topic Co-Occurrence Network")
plt.show()
