In [2]:
import csv
import collections
from collections import Counter
import re
from datetime import datetime
from datetime import time
import time
from geopy.geocoders import Nominatim
from sklearn.feature_extraction.text import TfidfVectorizer
import ast

In [6]:
input_file1 = r"Aug10_earthquake_tweets.csv"
input_file2 = r"Aug10_earthquake_tweets_media.csv"

In [None]:
"""
Loads a CSV, 
cleans and analyzes its text to count top words
"""

text_column_name = 'text'
top_n = 100

def clean_text(text):
    cleaned_text = re.sub(r'http\S+|@\w+|#\w+', '', text, flags=re.MULTILINE)
    cleaned_text = re.sub(r'[^\w\s]', '', cleaned_text)
    return cleaned_text.lower()

def get_word_distribution(file_path, column_name):
    word_counts = collections.Counter()
    try:
        with open(file_path, mode='r', encoding='utf-8') as infile:
            reader = csv.DictReader(infile)
            if column_name not in reader.fieldnames:
                print(f"Error: The column '{column_name}' was not found in the file.")
                return None
            for row in reader:
                text = row.get(column_name, '')
                if text:
                    cleaned = clean_text(text)
                    words = cleaned.split()
                    word_counts.update(words)
    except Exception as e:
        print(f"Exception: {e}")
        return None
    return word_counts

if __name__ == '__main__':
    word_counts = get_word_distribution(input_file2, text_column_name)
    if word_counts:
        top_words = word_counts.most_common(top_n)
        print(f"Top {top_n} Words in {input_file2}")
        for word, count in top_words:
            print(f"{word} ({count})")


Top 100 Words in Aug10_earthquake_tweets_media.csv
deprem (2442)
olsun (2315)
bir (2045)
balıkesir (2026)
geçmiş (2021)
ve (1838)
meydana (1243)
61 (1153)
balıkesirde (1080)
de (951)
sındırgı (937)
gelen (873)
bu (805)
tüm (760)
her (605)
etkilenen (560)
vatandaşlarımıza (560)
geldi (559)
depremde (542)
çok (539)
depremden (539)
can (528)
yıkılan (523)
hissedilen (517)
bina (513)
büyüklüğünde (510)
en (495)
büyüklüğündeki (474)
oldu (464)
sındırgıda (457)
allah (438)
da (432)
sonrası (426)
korusun (425)
ne (424)
olan (405)
var (394)
rabbim (391)
için (375)
kaybı (355)
çevre (348)
daha (347)
merkez (342)
balıkesirin (342)
ilçesinde (327)
şener (326)
yine (316)
1 (314)
dileklerimizi (311)
üssü (307)
depremin (292)
önce (280)
yok (279)
62 (275)
altında (273)
türlü (268)
hissedildi (267)
kişi (264)
iletiyoruz (261)
i̇stanbul (257)
depremi (256)
büyük (251)
türkiye (251)
enkaz (246)
gsm (243)
milletimizi (238)
km (231)
illerde (231)
üşümezsoy (231)
yıkıldı (229)
büyüklük (226)
ama (226)
baş

In [None]:
"""
Reads a CSV file’s date column, counts posts per hour, calculates the percentage for each hour, 
and prints an hourly posting distribution
"""
date_column_name = 'created_at'

def get_hour_distribution(file_path, column_name):
    hour_counts = collections.Counter()
    total_posts = 0
    try:
        with open(file_path, mode='r', encoding='utf-8') as infile:
            reader = csv.DictReader(infile)
            if column_name not in reader.fieldnames:
                print(f"Error: The column '{column_name}' was not found in '{file_path}'.")
                return None, None
            for row in reader:
                date_str = row.get(column_name, '')
                if date_str:
                    try:
                        dt_object = datetime.strptime(date_str, '%Y-%m-%d %H:%M:%S %z')
                        hour = dt_object.hour
                        hour_counts[hour] += 1
                        total_posts += 1
                    except ValueError:
                        continue
    except Exception:
        return None, None
    return hour_counts, total_posts

if __name__ == '__main__':
    hour_counts1, total_posts1 = get_hour_distribution(input_file2, date_column_name)
    if hour_counts1 and total_posts1:
        print(f"{'Hour':<10} {'Posts in ' + input_file2 + ' (%)':<30}")
        for hour in range(24):
            percentage1 = (hour_counts1.get(hour, 0) / total_posts1) * 100
            print(f"{hour:<10} {percentage1:>28.2f}%")


Hour       Posts in Aug10_earthquake_tweets_media.csv (%)
0                                  4.80%
1                                  2.11%
2                                  0.95%
3                                  0.46%
4                                  0.61%
5                                  0.63%
6                                  0.73%
7                                  0.10%
8                                  0.00%
9                                  0.00%
10                                 0.00%
11                                 0.00%
12                                 0.00%
13                                 0.00%
14                                 0.00%
15                                 0.00%
16                                 0.00%
17                                 0.00%
18                                 0.00%
19                                 2.96%
20                                46.50%
21                                20.77%
22                                11.65%

In [None]:
"""
Reads the CSV file’s created_at column, counts posts per day, calculates daily posting percentages, 
and prints the distribution
"""
date_column_name = 'created_at'

def get_date_distribution(file_path, column_name):
    date_counts = collections.Counter()
    total_posts = 0
    try:
        with open(file_path, mode='r', encoding='utf-8') as infile:
            reader = csv.DictReader(infile)
            for row in reader:
                date_str = row.get(column_name, '')
                if date_str:
                    try:
                        dt_object = datetime.strptime(date_str, '%Y-%m-%d %H:%M:%S %z')
                        date_obj = dt_object.date()
                        date_counts[date_obj] += 1
                        total_posts += 1
                    except ValueError:
                        continue
    except Exception:
        return None, None
    return date_counts, total_posts

if __name__ == '__main__':
    date_counts1, total_posts1 = get_date_distribution(input_file2, date_column_name)
    if date_counts1 and total_posts1:
        all_dates = sorted(date_counts1.keys())
        print(f"{'Date':<15} {'Posts in ' + input_file2 + ' (%)':<35}")
        for date_obj in all_dates:
            percentage1 = (date_counts1.get(date_obj, 0) / total_posts1) * 100
            print(f"{str(date_obj):<15} {percentage1:>33.2f}%")
        print(f"Total posts in '{input_file2}': {total_posts1}")


Date            Posts in Aug10_earthquake_tweets_media.csv (%)
2025-08-10                                  89.61%
2025-08-11                                  10.39%
Total posts in 'Aug10_earthquake_tweets_media.csv': 8752


In [None]:
"""
Computes TF-IDF scores for every word, 
and prints the top-scoring terms 
"""

text_column_name = 'text'
top_n = 20

def clean_text(text):
    cleaned_text = re.sub(r'http\S+|@\w+|#\w+', '', text, flags=re.MULTILINE)
    cleaned_text = re.sub(r'[^\w\s]', '', cleaned_text)
    return cleaned_text.lower()

def get_text_from_file(file_path, column_name):
    all_text = []
    try:
        with open(file_path, mode='r', encoding='utf-8') as infile:
            reader = csv.DictReader(infile)
            if column_name not in reader.fieldnames:
                print(f"Error: The column '{column_name}' was not found in '{file_path}'.")
                return None
            for row in reader:
                text = row.get(column_name, '')
                if text:
                    cleaned = clean_text(text)
                    all_text.append(cleaned)
    except Exception:
        return None
    return " ".join(all_text)

if __name__ == '__main__':
    document1 = get_text_from_file(input_file2, text_column_name)
    if document1:
        corpus = [document1]
        vectorizer = TfidfVectorizer()
        tfidf_matrix = vectorizer.fit_transform(corpus)
        feature_names = vectorizer.get_feature_names_out()
        tfidf_scores = tfidf_matrix[0].toarray().flatten()
        top_words = sorted(zip(feature_names, tfidf_scores), key=lambda x: x[1], reverse=True)[:top_n]
        print(f"Top TF-IDF Words in {input_file2}")
        for word, score in top_words:
            print(f"{word:<25} ({score:.4f})")


Top TF-IDF Words in Aug10_earthquake_tweets_media.csv
deprem                    (0.3552)
olsun                     (0.3367)
bir                       (0.2975)
balıkesir                 (0.2947)
geçmiş                    (0.2940)
ve                        (0.2674)
meydana                   (0.1808)
61                        (0.1677)
balıkesirde               (0.1571)
de                        (0.1383)
sındırgı                  (0.1363)
gelen                     (0.1270)
bu                        (0.1171)
tüm                       (0.1106)
her                       (0.0880)
geldi                     (0.0822)
etkilenen                 (0.0815)
vatandaşlarımıza          (0.0815)
depremde                  (0.0788)
depremden                 (0.0784)


In [None]:
"""
Finds most common hashtags
based on frequency or engagement
and prints them
"""

top_n = 50
by_engagement = True #False to give by frequency, True to give by engagement

def parse_hashtags(cell):
    if not cell:
        return []
    try:
        tags = ast.literal_eval(cell)
        if not isinstance(tags, list):
            return []
        seen = set()
        for t in tags:
            if isinstance(t, str):
                tt = t.strip().lstrip("#").lower()
                if tt:
                    seen.add(tt)
        return list(seen)
    except Exception:
        return []

freq = Counter()
weighted = Counter()

with open(input_file2, newline="", encoding="utf-8") as f:
    reader = csv.DictReader(f)
    for row in reader:
        tags = parse_hashtags(row.get("hashtags", ""))
        if not by_engagement:
            freq.update(tags)
        else:
            try:
                likes = float(row.get("like_count") or 0)
                rts = float(row.get("retweet_count") or 0)
            except ValueError:
                likes = rts = 0.0
            weight = likes + rts
            if tags and weight > 0:
                for t in tags:
                    weighted[t] += weight

if not by_engagement:
    print("Top Hashtags by Frequency")
    print("hashtag\tcount")
    for tag, c in freq.most_common(top_n):
        print(f"{tag}\t{c}")
else:
    print("Top Hashtags by Engagement (likes+retweets)")
    print("hashtag\tscore")
    for tag, s in weighted.most_common(top_n):
        s_out = int(s) if abs(s - int(s)) < 1e-9 else f"{s:.1f}"
        print(f"{tag}\t{s_out}")


Top Hashtags by Engagement (likes+retweets)
hashtag	score
deprem	631775
balıkesir	77836
depremoldu	37984
sallandık	27206
afad	26655
balikesir	17170
sındırgı	13196
sondakika	11548
sondaki̇ka	9147
gazzeyei̇nsanikoridor	8452
istanbuldeprem	4784
gsm	4408
izmir	4384
bursa	3786
istanbul	3006
manisa	2981
i̇stanbul	2005
çöktü	1986
ankara	1891
izmirdeprem	1835
gazzeyeumutol	1728
earthquake	1708
gazze	1523
türkiyedekatliamvar	1423
balıkesirdeprem	1421
sokakhayvanlarısahipsizdeğildir	1314
geçmişolsun	1197
emekliultrafakir	1184
5000kısmiultramağdur	1183
hissettik	1105
chpkomisyondançik	1077
devletmitingyapmazgereğiniyapar	845
aysetokyazi̇cinadalet	841
türkiye	797
depremiunutmaunutturma	676
emekliyearazaminsanihaktır	653
yangın	651
afat	581
trump_und_putin	545
demokratie	545
gazzeaçlıktanölüyor	534
pazar	491
şenerüşümezsoy	487
gündem	484
amedspor	482
gaza	437
31temmuzcovidyasası	431
sındırgıdeprem	421
turkcell	413
pazartesi	392
