In [None]:
import os
import pandas as pd
import re
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
import networkx as nx
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from tqdm import tqdm

# Fetch the NLTK resources this notebook relies on: the Punkt tokenizer
# models used by word_tokenize() and the stop-word lists.
nltk.download('punkt')
# NLTK 3.9 and later load the Punkt models from 'punkt_tab' rather than
# 'punkt'; on older releases this id is unknown and the call simply
# reports an error and returns False.
nltk.download('punkt_tab')
nltk.download('stopwords')

# Compiled once; both patterns are applied to every tweet.
_URL_PATTERN = re.compile(r"http\S+|www\S+")
_NON_WORD_PATTERN = re.compile(r"[^\w\s]")
# Lazily filled cache so the stop-word list and stemmer are built only once.
_TEXT_TOOLS = {}


def _get_text_tools():
    """Return the cached ``(stop_words, stemmer)`` pair, building it on first use."""
    if not _TEXT_TOOLS:
        _TEXT_TOOLS["stop_words"] = frozenset(stopwords.words('english'))
        _TEXT_TOOLS["stemmer"] = PorterStemmer()
    return _TEXT_TOOLS["stop_words"], _TEXT_TOOLS["stemmer"]


def preprocess_tweet(tweet):
    """Normalise one tweet into a space-separated string of stemmed tokens.

    Steps, in order:

    1. remove URL-like runs starting with ``http`` or ``www``,
    2. remove every character that is neither a word character nor whitespace,
    3. tokenise with ``word_tokenize``,
    4. drop English stop words (compared case-insensitively),
    5. stem each remaining token with the Porter stemmer.

    Args:
        tweet: Raw tweet text. Any non-string value (``None``, the float NaN
            pandas uses for missing cells, ...) is treated as an empty tweet.

    Returns:
        str: The processed tokens joined by single spaces; ``""`` when
        nothing is left or the input was not a string.
    """
    # Missing cells arrive as float NaN and would make the regex calls raise.
    if not isinstance(tweet, str):
        return ""

    tweet = _URL_PATTERN.sub('', tweet)
    tweet = _NON_WORD_PATTERN.sub('', tweet)

    stop_words, stemmer = _get_text_tools()
    tokens = [
        stemmer.stem(word)
        for word in word_tokenize(tweet)
        if word.lower() not in stop_words
    ]
    return " ".join(tokens)

def process_folder(folder_path):
    """Load and clean every ``.csv`` file directly inside ``folder_path``.

    Each file must contain a ``"Tweet"`` column. Cleaning steps, in order:

    1. drop rows whose tweet is missing,
    2. drop tweets that start with ``"RT"``,
    3. drop duplicate tweets (the first occurrence is kept),
    4. drop tweets containing ``"@"``,
    5. replace each tweet with ``preprocess_tweet(tweet)``,
    6. drop tweets that are blank after preprocessing.

    NOTE(review): step 2 is a plain prefix test, so an ordinary tweet that
    merely begins with the letters "RT" is dropped as well.

    Args:
        folder_path: Directory to scan; sub-directories are not visited.

    Returns:
        dict: Maps each CSV file name to its cleaned ``pandas.DataFrame``,
        inserted in sorted file-name order.

    Raises:
        KeyError: If a CSV file has no ``"Tweet"`` column.
    """
    processed_dfs = {}
    # os.listdir() returns entries in arbitrary order; sort so reruns agree.
    for file_name in sorted(os.listdir(folder_path)):
        if not file_name.endswith('.csv'):
            continue

        df = pd.read_csv(os.path.join(folder_path, file_name))

        # Missing tweets have to go before any .str filter: a NaN in the
        # boolean mask makes the ``~`` negation (or the later text
        # processing) raise instead of dropping the row.
        df = df.dropna(subset=["Tweet"])
        # Guarantees the .str accessor works even if the column was parsed
        # as a non-string dtype (e.g. an entirely empty column).
        df = df.assign(Tweet=df["Tweet"].astype(str))

        df = df[~df["Tweet"].str.startswith("RT")]
        df = df.drop_duplicates(subset=["Tweet"])
        df = df[~df["Tweet"].str.contains("@", regex=False)]

        # assign() builds a new frame rather than writing into a filtered
        # slice, which avoids pandas' SettingWithCopyWarning.
        df = df.assign(Tweet=df["Tweet"].apply(preprocess_tweet))
        df = df[df["Tweet"].str.strip() != ""]

        processed_dfs[file_name] = df

    return processed_dfs

def save_processed_csvs(processed_dfs, output_folder):
    """Write every DataFrame in ``processed_dfs`` to ``output_folder`` as CSV.

    Args:
        processed_dfs: Mapping of output file name -> ``pandas.DataFrame``.
        output_folder: Destination directory; created (with parents) when it
            does not exist yet.

    Each frame is written without its index, under its mapping key as the
    file name. An existing file with the same name is overwritten.
    """
    os.makedirs(output_folder, exist_ok=True)
    for csv_name in processed_dfs:
        destination = os.path.join(output_folder, csv_name)
        processed_dfs[csv_name].to_csv(destination, index=False)

In [None]:
# Run the cleaning pipeline: read every CSV found in `input_folder`, clean
# it with process_folder(), and write each result under the same file name
# in `output_folder`. Both paths are relative, so they resolve against the
# notebook's current working directory.
input_folder = "input"
output_folder = "output"
processed_dataframes = process_folder(input_folder)
save_processed_csvs(processed_dataframes, output_folder)
print(f"Processed files saved to {output_folder}")

In [None]:
def create_graph_of_words(tweet):
    """Build a complete, weighted graph over the distinct words of a tweet.

    The tweet is split on whitespace. Every pair of different words is joined
    by an edge whose weight is ``1 / (n - 1)``, where ``n`` is the number of
    tokens in the tweet (repeats included).

    Args:
        tweet: Whitespace-separated text, e.g. an already preprocessed tweet.

    Returns:
        networkx.Graph: The word graph. An empty graph (no nodes, no edges)
        is returned when the tweet has fewer than two distinct words.
    """
    words = tweet.split()
    # nx.complete_graph() pairs up the node sequence positionally, so a
    # repeated word would be paired with itself and produce a self-loop.
    # Deduplicate first, keeping first-seen order.
    distinct_words = list(dict.fromkeys(words))
    if len(distinct_words) <= 1:
        return nx.Graph()

    G = nx.complete_graph(distinct_words)
    # len(words) >= len(distinct_words) >= 2 here, so the divisor is never 0.
    weight = 1 / (len(words) - 1)
    for u, v in G.edges():
        G[u][v]['weight'] = weight
    return G

def aggregate_id_graph(tweets):
    """Merge the word graphs of several tweets into a single weighted graph.

    Args:
        tweets: Iterable of tweet strings; each is passed to
            ``create_graph_of_words``.

    Returns:
        networkx.Graph: Contains every edge that occurs in any per-tweet
        graph. An edge's ``weight`` is the sum of its weights across all
        tweets in which it appears.
    """
    merged = nx.Graph()
    for text in tweets:
        tweet_graph = create_graph_of_words(text)
        for left, right, attrs in tweet_graph.edges(data=True):
            if not merged.has_edge(left, right):
                # First time this word pair is seen.
                merged.add_edge(left, right, weight=attrs['weight'])
                continue
            merged[left][right]['weight'] += attrs['weight']
    return merged

def extract_graph_features(graph):
    """Summarise a weighted graph as a four-element feature list.

    Args:
        graph: A networkx graph whose edges all carry a ``'weight'`` attribute.

    Returns:
        list: ``[num_nodes, num_edges, avg_edge_weight, degree_centrality]``
        where ``avg_edge_weight`` is the mean edge weight (``0`` when there
        are no edges) and ``degree_centrality`` is the mean of the per-node
        degree centralities (``0`` when there are no nodes).
    """
    node_count = graph.number_of_nodes()
    edge_count = graph.number_of_edges()

    if edge_count > 0:
        edge_weights = [attrs['weight'] for _, _, attrs in graph.edges(data=True)]
        mean_weight = np.mean(edge_weights)
    else:
        mean_weight = 0

    if node_count > 0:
        per_node = dict(nx.degree_centrality(graph))
        mean_centrality = sum(per_node.values()) / node_count
    else:
        mean_centrality = 0

    return [node_count, edge_count, mean_weight, mean_centrality]