In [4]:
import csv
import collections
from collections import Counter
import re
from datetime import datetime
from datetime import time
import time
from geopy.geocoders import Nominatim
from sklearn.feature_extraction.text import TfidfVectorizer
import ast

In [2]:
input_file1 = r"tweets_tr_help_formatted.csv"
input_file1_name = "tr_help_formatted"
input_file2 = r"tweets_tr.csv"
input_file2_name = "tweets_tr"

In [None]:
"""
Loads two CSV Tweet datasets, 
cleans and analyzes their text to count top words, post times, dates, locations, and hashtags (both by frequency and engagement), 
and prints them
"""

text_column_name = 'content'

top_n = 100

def clean_text(text):
    #Cleans a string by converting it to lowercase, removing common Twitter artifacts, and stripping punctuation and symbols.

    cleaned_text = re.sub(r'http\S+|@\w+|#\w+', '', text, flags=re.MULTILINE)
    cleaned_text = re.sub(r'[^\w\s]', '', cleaned_text)
    return cleaned_text.lower()

def get_word_distribution(file_path, column_name):
    word_counts = collections.Counter()
    try:
        with open(file_path, mode='r', encoding='utf-8') as infile:
            reader = csv.DictReader(infile)
            
            if column_name not in reader.fieldnames:
                print(f"Error: The column '{column_name}' was not found in the file.")
                return None
            
            for row in reader:
                text = row.get(column_name, '')
                if text:
                    cleaned = clean_text(text)
                    words = cleaned.split()
                    word_counts.update(words)
                    
    except Exception as e:
        print(f"Exception")
        return None
        
    return word_counts

if __name__ == '__main__':
    word_counts1 = get_word_distribution(input_file1, text_column_name)
    word_counts2 = get_word_distribution(input_file2, text_column_name)

    if __name__ == '__main__':
        word_counts1 = get_word_distribution(input_file1, text_column_name)
        word_counts2 = get_word_distribution(input_file2, text_column_name)

        if word_counts1 and word_counts2:
            top_words1 = word_counts1.most_common(top_n)
            top_words2 = word_counts2.most_common(top_n)

            print(f"{'Top Words in ' + input_file1_name:<50} {'    Top Words in ' + input_file2_name:<50}")
            max_length = max(len(top_words1), len(top_words2))

            for i in range(max_length):
                word1, count1 = top_words1[i] if i < len(top_words1) else ('', '')
                word2, count2 = top_words2[i] if i < len(top_words2) else ('', '')
                print(f"{str(word1) + ' (' + str(count1) + ')':<60} {str(word2) + ' (' + str(count2) + ')':<70}")
                
          

In [None]:
"""
Reads each file’s date column, counts posts per hour, calculates the percentage for each hour, 
and prints an hourly posting distribution comparison for the two datasets
"""

date_column_name = 'date'

def get_hour_distribution(file_path, column_name):
    hour_counts = collections.Counter()
    total_posts = 0
    
    try:
        with open(file_path, mode='r', encoding='utf-8') as infile:
            reader = csv.DictReader(infile)
            
            if column_name not in reader.fieldnames:
                print(f"Error: The column '{column_name}' was not found in '{file_path}'.")
                return None, None
            
            for row in reader:
                date_str = row.get(column_name, '')
                if date_str:
                    try:
                        dt_object = datetime.strptime(date_str, '%Y-%m-%d %H:%M:%S%z')
                        hour = dt_object.hour
                        hour_counts[hour] += 1
                        total_posts += 1
                    except ValueError as e:
                        print(f"Value error")
                        continue
                        

    except Exception as e:
        return None, None
        
    return hour_counts, total_posts

if __name__ == '__main__':
    hour_counts1, total_posts1 = get_hour_distribution(input_file1, date_column_name)
    hour_counts2, total_posts2 = get_hour_distribution(input_file2, date_column_name)

    if hour_counts1 and hour_counts2 and total_posts1 and total_posts2:
        print(f"{'Hour':<10} {'Posts in ' + input_file1 + ' (%)':<30} {'Posts in ' + input_file2 + ' (%)':<30}")
    
        for hour in range(24):
            percentage1 = (hour_counts1.get(hour, 0) / total_posts1) * 100
            percentage2 = (hour_counts2.get(hour, 0) / total_posts2) * 100

            print(f"{hour:<10} {percentage1:>28.2f}{percentage2:>28.2f}%")



Hour       Posts in tweets_tr_help_formatted.csv (%) Posts in tweets_tr.csv (%)    
0                                  1.31                        1.40%
1                                  1.24                        1.33%
2                                  0.82                        1.08%
3                                  0.73                        0.99%
4                                  1.15                        1.54%
5                                  1.31                        2.16%
6                                  1.88                        2.56%
7                                  2.58                        3.03%
8                                  3.53                        4.26%
9                                  4.94                        5.66%
10                                 5.26                        5.78%
11                                 7.65                        6.34%
12                                 6.24                        5.49%
13                 

In [6]:
"""
Reads each file’s date column, counts posts per calendar day, calculates daily posting percentages, 
and prints a day-by-day comparison along with total post counts
"""

date_column_name = 'date'

def get_date_distribution(file_path, column_name):
    date_counts = collections.Counter()
    total_posts = 0
    
    try:
        with open(file_path, mode='r', encoding='utf-8') as infile:
            reader = csv.DictReader(infile)
            
            if column_name not in reader.fieldnames:
                print(f"Error: The column '{column_name}' was not found in '{file_path}'.")
                return None, None
            
            for row in reader:
                date_str = row.get(column_name, '')
                if date_str:
                    try:
                        dt_object = datetime.strptime(date_str, '%Y-%m-%d %H:%M:%S%z')
                        date_obj = dt_object.date()
                        date_counts[date_obj] += 1
                        total_posts += 1
                    except ValueError as e:
                        print(f"Warning: Could not parse date '{date_str}' in '{file_path}'. Error: {e}")
                        continue
                        
    except Exception as e:
        print(f"Exception")
        return None, None
        
    return date_counts, total_posts

if __name__ == '__main__':
    date_counts1, total_posts1 = get_date_distribution(input_file1, date_column_name)
    date_counts2, total_posts2 = get_date_distribution(input_file2, date_column_name)

    if date_counts1 and date_counts2 and total_posts1 and total_posts2:

        all_dates = sorted(list(date_counts1.keys() | date_counts2.keys()))
        print(f"{'Date':<15} {'Posts in ' + input_file1 + ' (%)':<35} {'|':<5} {'Posts in ' + input_file2 + ' (%)':<35}")

        for date_obj in all_dates:
            percentage1 = (date_counts1.get(date_obj, 0) / total_posts1) * 100
            percentage2 = (date_counts2.get(date_obj, 0) / total_posts2) * 100

            print(f"{str(date_obj):<15} {percentage1:>33.2f}%  {percentage2:>33.2f}%")

        print(f"Total posts in '{input_file1}': {total_posts1}")
        print(f"Total posts in '{input_file2}': {total_posts2}")


Date            Posts in tweets_tr_help_formatted.csv (%) |     Posts in tweets_tr.csv (%)         
2023-02-06                                  43.45%                              38.81%
2023-02-07                                  35.03%                              30.22%
2023-02-08                                  16.43%                              16.50%
2023-02-09                                   5.05%                               9.24%
2023-02-10                                   0.00%                               1.77%
2023-02-11                                   0.02%                               1.20%
2023-02-12                                   0.00%                               1.86%
2023-02-13                                   0.00%                               0.09%
2023-02-14                                   0.00%                               0.03%
2023-02-15                                   0.00%                               0.05%
2023-02-16                    

In [None]:
"""
Cleans, and concatenates all text from each file, computes TF-IDF scores for every word, 
and prints the top-scoring terms from each dataset
"""
text_column_name = 'content'

top_n = 20

def clean_text(text):
    cleaned_text = re.sub(r'http\S+|@\w+|#\w+', '', text, flags=re.MULTILINE)
    cleaned_text = re.sub(r'[^\w\s]', '', cleaned_text)
    return cleaned_text.lower()

def get_text_from_file(file_path, column_name):

    all_text = []
    
    try:
        with open(file_path, mode='r', encoding='utf-8') as infile:
            reader = csv.DictReader(infile)
            
            if column_name not in reader.fieldnames:
                print(f"Error: The column '{column_name}' was not found in '{file_path}'.")
                return None
            
            for row in reader:
                text = row.get(column_name, '')
                if text:
                    cleaned = clean_text(text)
                    all_text.append(cleaned)
                    
    except Exception as e:
        print(f"Exception")
        return None
        
    return " ".join(all_text)

if __name__ == '__main__':
    document1 = get_text_from_file(input_file1, text_column_name)
    document2 = get_text_from_file(input_file2, text_column_name)

    if document1 and document2:
        corpus = [document1, document2]
        vectorizer = TfidfVectorizer()
        
        tfidf_matrix = vectorizer.fit_transform(corpus)
        feature_names = vectorizer.get_feature_names_out()
        
        tfidf_scores1 = tfidf_matrix[0].toarray().flatten()
        top_words1 = sorted(zip(feature_names, tfidf_scores1), key=lambda x: x[1], reverse=True)[:top_n]
        tfidf_scores2 = tfidf_matrix[1].toarray().flatten()
        top_words2 = sorted(zip(feature_names, tfidf_scores2), key=lambda x: x[1], reverse=True)[:top_n]

        print(f"{'Top TF-IDF Words in ' + input_file1:<50} {'Top TF-IDF Words in ' + input_file2:<50}")
        
        max_length = max(len(top_words1), len(top_words2))
        for i in range(max_length):
            word1, score1 = top_words1[i] if i < len(top_words1) else ('', '')
            word2, score2 = top_words2[i] if i < len(top_words2) else ('', '')
            
            print(f"{str(word1):<25} ({str(f'{score1:.4f}'):<15}) {str(word2):<25} ({str(f'{score2:.4f}'):<15})")
     


In [None]:
"""
Checks each file for coordinate or place data in the specified columns, counts how many rows contain each type, 
and prints total posts, and their location data
"""
want_coord = "coordinates" 
want_place = "place"       

EMPTY_TOKENS = {"", "none", "null", "nan", "[]", "{}", "na", "n/a"}

# Patterns for coordinates
pat_coords1 = re.compile(r"Coordinates\(\s*longitude=([\-+]?\d+(?:\.\d+)?),\s*latitude=([\-+]?\d+(?:\.\d+)?)\s*\)", re.I)
pat_coords2 = re.compile(r"Coordinates\(\s*latitude=([\-+]?\d+(?:\.\d+)?),\s*longitude=([\-+]?\d+(?:\.\d+)?)\s*\)", re.I)
pat_plain   = re.compile(r"^\s*([\-+]?\d+(?:\.\d+)?)\s*,\s*([\-+]?\d+(?:\.\d+)?)\s*$")

def norm(s):
    return (s or "").strip()

def is_empty_val(s):
    return norm(s).lower() in EMPTY_TOKENS

def has_coords(s):
    s = norm(s)
    if not s or is_empty_val(s): return False
    if pat_coords1.search(s) or pat_coords2.search(s): return True
    if pat_plain.match(s): return True
    return False

def has_place(s):
    s = norm(s)
    if not s or is_empty_val(s): return False

    if s.startswith("Place(") or "fullName='" in s or "name='" in s:
        return True

    return any(ch.isalpha() for ch in s)

def find_column(fieldnames, target):
    target_n = target.strip().lower()
    for fn in fieldnames:
        if fn and fn.strip().lower() == target_n:
            return fn
    return None

def analyze(file_path):
    total = with_coords = with_place_ = with_any = 0
    with open(file_path, "r", encoding="utf-8-sig", newline="") as f:
        reader = csv.DictReader(f)
        if not reader.fieldnames:
            return (0, 0, 0, 0)
        coord_col = find_column(reader.fieldnames, want_coord)
        place_col = find_column(reader.fieldnames, want_place)
        if not coord_col and not place_col:
            return (0, 0, 0, 0)

        for row in reader:
            total += 1
            cval = row.get(coord_col, "") if coord_col else ""
            pval = row.get(place_col, "") if place_col else ""

            hc = has_coords(cval)
            hp = has_place(pval)

            if hc: with_coords += 1
            if hp: with_place_ += 1
            if hc or hp: with_any += 1

    return (total, with_coords, with_place_, with_any)

def pct(part, whole):
    return f"{(part/whole*100):.2f}%" if whole else "0.00%"

if __name__ == "__main__":
    t1, c1, p1, a1 = analyze(input_file1)
    t2, c2, p2, a2 = analyze(input_file2)

   
    print("Location Analysis for " + input_file1 + "\t" + "Location Analysis for " + input_file2)
    print(f"Total Posts: {t1}\tTotal Posts: {t2}")
    print(f"With Coordinates: {c1} ({pct(c1,t1)})\tWith Coordinates: {c2} ({pct(c2,t2)})")
    print(f"With Place: {p1} ({pct(p1,t1)})\tWith Place: {p2} ({pct(p2,t2)})")
    print(f"With Any Location: {a1} ({pct(a1,t1)})\tWith Any Location: {a2} ({pct(a2,t2)})")


In [None]:
"""
Counts how many times each place value appears in the given CSV column for both files, 
then prints the most common locations along with the total number of unique locations in each dataset
"""

def get_location_distribution(file_path, place_col):
    location_counts = collections.Counter()
    
    try:
        with open(file_path, mode='r', encoding='utf-8') as infile:
            reader = csv.DictReader(infile)

            for row in reader:
                place = row.get(place_col)
    
                if place:
                    location_counts[place] += 1

    except Exception as e:
        print(f"Exception")
        return None
        
    return location_counts

if __name__ == '__main__':
    location_counts1 = get_location_distribution(input_file1, "place" )
    location_counts2 = get_location_distribution(input_file2, "place")

    if location_counts1 is not None and location_counts2 is not None:
        top_locations1 = location_counts1.most_common()
        top_locations2 = location_counts2.most_common()

        print(f"{'Top Locations in ' + input_file1:<50}{'Top Locations in ' + input_file2:<50}")
   
        max_length = max(len(top_locations1), len(top_locations2))
        for i in range(max_length):

            location1, count1 = top_locations1[i] if i < len(top_locations1) else ('', '')
            location2, count2 = top_locations2[i] if i < len(top_locations2) else ('', '')
            
            print(f"{str(location1):<25} ({str(count1):<15}) {str(location2):<25} ({str(count2):<15})")
            
        print(f"Total unique locations in '{input_file1}': {len(location_counts1)}")
        print(f"Total unique locations in '{input_file2}': {len(location_counts2)}")

In [None]:
"""
Reads each row of a CSV, parses coordinates or a place string, 
reverse forward geocodes it with Nominatim to get province/district/country, 
and prints those alongside the Tweet’s date

WARNING: When using Nominatim, do not set delay time below 1 second,
as this can result in your IP being banned for excessive requests
"""

user_agent = "tweet_geo_lookup"
delay_s = 1  
lang = "tr"    

geolocator = Nominatim(user_agent=user_agent)

coord_pattern = re.compile(
    r"longitude=([+-]?\d+(?:\.\d+)?),\s*latitude=([+-]?\d+(?:\.\d+)?)",
    re.IGNORECASE
)

# Caches to avoid repeated API calls
coord_cache = {}      
placename_cache = {}  

def parse_coordinates(coord_str: str):
    if not coord_str:
        return None
    m = coord_pattern.search(coord_str)
    if not m:
        return None
    lon, lat = float(m.group(1)), float(m.group(2))
    return (lat, lon)  

def reverse_to_admin(latlon):
    if latlon in coord_cache:
        return coord_cache[latlon]
    province = district = cc = None
    try:
        loc = geolocator.reverse(latlon, language=lang, addressdetails=True)
        if loc and loc.raw and "address" in loc.raw:
            addr = loc.raw["address"]
            # Türkiye: province can come under 'state' or 'province'
            province = addr.get("state") or addr.get("province")
            # District can appear as 'county' or 'state_district' or 'city'
            district = addr.get("state_district") or addr.get("county") or addr.get("city")
            cc = (addr.get("country_code") or "").upper()
    except Exception as e:
        print(f"reverse error for {latlon}: {e}")
    coord_cache[latlon] = (province, district, cc)
    return coord_cache[latlon]

def extract_place_name(place_cell: str):

    if not place_cell:
        return None
    m = re.search(r"fullName='([^']+)'", place_cell)
    if m:
        return m.group(1)
    m2 = re.search(r"name='([^']+)'", place_cell)
    if m2:
        return m2.group(1)
    return None

def forward_place_to_admin(place_str: str):

    if place_str in placename_cache:
        return placename_cache[place_str]
    province = district = cc = None
    try:
        
        query = f"{place_str}, Türkiye" if "Türkiye" not in place_str else place_str
        loc = geolocator.geocode(query, language=lang, addressdetails=True)
        if loc and loc.raw and "address" in loc.raw:
            addr = loc.raw["address"]
            province = addr.get("state") or addr.get("province")
            district = addr.get("state_district") or addr.get("county") or addr.get("city")
            cc = (addr.get("country_code") or "").upper()
    except Exception as e:
        print("Exception error") 

    placename_cache[place_str] = (province, district, cc)
    return placename_cache[place_str]

with open(input_file1, newline="", encoding="utf-8") as infile:
    reader = csv.DictReader(infile)

    for row in reader:
        province = district = cc = None

        latlon = parse_coordinates((row.get("coordinates") or "").strip())
        if latlon:
            province, district, cc = reverse_to_admin(latlon)
            # Sleep only if we actually called the API 
            if coord_cache.get(latlon) == (province, district, cc) and delay_s > 0:
                time.sleep(delay_s)

        if not province:
            place_cell = (row.get("place") or "").strip()
            place_name = extract_place_name(place_cell)
            if place_name:
                province2, district2, cc2 = forward_place_to_admin(place_name)
                if placename_cache.get(place_name) == (province2, district2, cc2) and delay_s > 0:
                    time.sleep(delay_s)
                province = province or province2
                district = district or district2
                cc = cc or cc2

        if province or district or cc:
            print(f"{row.get('date')}, province: {province}, district: {district}, country: {cc}")


2023-02-09 17:29:51+00:00, province: İstanbul, district: None, country: TR
2023-02-09 14:42:51+00:00, province: Bretagne, district: Côtes-d'Armor, country: FR
2023-02-09 14:42:01+00:00, province: Hatay, district: None, country: TR


KeyboardInterrupt: 

In [9]:

top_n        = 50
by_engagement = True  # False gives frequency, True gives engagement

def parse_hashtags(cell):
    if not cell:
        return []
    try:
        tags = ast.literal_eval(cell)
        if not isinstance(tags, list):
            return []
        seen = set()
        for t in tags:
            if isinstance(t, str):
                tt = t.strip().lstrip("#").lower()
                if tt:
                    seen.add(tt)
        return list(seen)
    except Exception:
        return []

freq = Counter()
weighted = Counter()

with open(input_file1, newline="", encoding="utf-8") as f:
    reader = csv.DictReader(f)
    for row in reader:
        tags = parse_hashtags(row.get("hashtags", ""))

        if not by_engagement:
            freq.update(tags)
        else: 
            try:
                likes = float(row.get("like_count") or 0)
                rts   = float(row.get("rt_count") or 0)
            except ValueError:
                likes = rts = 0.0
            weight = likes + rts
            if tags and weight > 0:
                for t in tags:
                    weighted[t] += weight

if not by_engagement:
    print("Top Hashtags by Frequency")
    print("hashtag\tcount")
    for tag, c in freq.most_common(top_n):
        print(f"{tag}\t{c}")
else:
    print("Top Hashtags by Engagement (likes+retweets)")
    print("hashtag\tscore")
    for tag, s in weighted.most_common(top_n):
        s_out = int(s) if abs(s - int(s)) < 1e-9 else f"{s:.1f}"
        print(f"{tag}\t{s_out}")


Top Hashtags by Engagement (likes+retweets)
hashtag	score
turkey	1108990
deprem	997105
seferberlik	299579
afad	294006
hatay	288861
earthquake	239642
hatayyardimbekliyor	239210
enkazaltındayım	195678
ohal	174347
sondakikadeprem	146940
prayforturkey	144520
enkaz	128801
hataydeprem	121965
aci̇l	116100
elbistan	112577
sondakika	100478
enkazalti̇ndayi̇m	97487
gaziantep	96608
kahramanmaras	92490
depremoldu	80701
turkeyearthquake	78092
helpturkey	69483
ahbap	67291
acildeprem	64440
yardim	63379
iskenderun	61290
malatya	56999
oguzhanugur	54937
afadhatay	51720
haarp	50311
haluklevent	43550
afaddeprem	42792
turkiye	41693
adiyaman	41588
adıyaman	39102
acil	37723
maras	37331
kahramanmaraş	37187
hatayiskenderun	36540
depremsondakika	36463
hataydepremi	35555
kilis	33723
osmaniye	32782
diyarbakır	31300
adana	31295
antakya	30789
acilyardim	29822
nurdagi	29804
turkeyquake	27729
mardin	26861
