# Import Dependencies

In [8]:
import os
import json

# Directories

In [2]:
pheme_dir = 'data/pheme-rnr-dataset'

# Import the datasets

In [3]:
import os
import pandas as pd

# List to store the data
data = []

# Iterate over the events
for event_folder in os.listdir(pheme_dir):
    event_path = os.path.join(pheme_dir, event_folder)
    if os.path.isdir(event_path):
        # Check if the event folder contains both 'rumours' and 'non-rumours' subfolders
        rumours_path = os.path.join(event_path, 'rumours')
        non_rumours_path = os.path.join(event_path, 'non-rumours')
        if os.path.exists(rumours_path) and os.path.exists(non_rumours_path):
            # Load rumors
            rumours = os.listdir(rumours_path)
            for tweet_id in rumours:
                tweet_path = os.path.join(rumours_path, tweet_id, 'source-tweet')
                tweet_text = None
                for file_name in os.listdir(tweet_path):
                    if file_name.endswith('.json'):
                        with open(os.path.join(tweet_path, file_name), 'r', encoding='utf-8') as file:
                            tweet_data = json.load(file)
                            tweet_text = tweet_data['text']
                            break
                if tweet_text:
                    data.append({'Event': event_folder, 'Tweet ID': tweet_id, 'Label': 'rumor', 'Tweet Text': tweet_text})
            # Load non-rumours
            non_rumours = os.listdir(non_rumours_path)
            for tweet_id in non_rumours:
                tweet_path = os.path.join(non_rumours_path, tweet_id, 'source-tweet')
                tweet_text = None
                for file_name in os.listdir(tweet_path):
                    if file_name.endswith('.json'):
                        with open(os.path.join(tweet_path, file_name), 'r', encoding='utf-8') as file:
                            tweet_data = json.load(file)
                            tweet_text = tweet_data['text']
                            break
                if tweet_text:
                    data.append({'Event': event_folder, 'Tweet ID': tweet_id, 'Label': 'non-rumor', 'Tweet Text': tweet_text})

# Create a DataFrame from the collected data
pheme_df = pd.DataFrame(data)
print(pheme_df.head())


          Event            Tweet ID  Label  \
0  charliehebdo  552783238415265792  rumor   
1  charliehebdo  552783667052167168  rumor   
2  charliehebdo  552783745565347840  rumor   
3  charliehebdo  552784168849907712  rumor   
4  charliehebdo  552784526955806720  rumor   

                                          Tweet Text  
0  Breaking: At least 10 dead, 5 injured after tO...  
1  France: 10 people dead after shooting at HQ of...  
2  Ten killed in shooting at headquarters of Fren...  
3  BREAKING: 10 dead in shooting at headquarters ...  
4  Reuters: 10 people shot dead at headquarters o...  


# Data Cleaning

In [4]:
from main_model.scripts.data_cleaning import clean_text

# Assuming you have a DataFrame 'df' with a column 'Tweet Text'
pheme_df['Cleaned Text'] = pheme_df['Tweet Text'].apply(clean_text)

print(pheme_df.head())


          Event            Tweet ID  Label  \
0  charliehebdo  552783238415265792  rumor   
1  charliehebdo  552783667052167168  rumor   
2  charliehebdo  552783745565347840  rumor   
3  charliehebdo  552784168849907712  rumor   
4  charliehebdo  552784526955806720  rumor   

                                          Tweet Text  \
0  Breaking: At least 10 dead, 5 injured after tO...   
1  France: 10 people dead after shooting at HQ of...   
2  Ten killed in shooting at headquarters of Fren...   
3  BREAKING: 10 dead in shooting at headquarters ...   
4  Reuters: 10 people shot dead at headquarters o...   

                                        Cleaned Text  
0  breaking at least  dead  injured after to gunm...  
1  france  people dead after shooting at hq of sa...  
2  ten killed in shooting at headquarters of fren...  
3  breaking  dead in shooting at headquarters of ...  
4  reuters  people shot dead at headquarters of f...  


# POS Tagging

In [5]:
from sklearn.model_selection import train_test_split
import nltk

# Define a function to tag the text
def tag_text(text):
    words = nltk.word_tokenize(text)
    tagged_words = nltk.pos_tag(words)
    return tagged_words

pheme_df['POS Tagged Text'] = pheme_df['Cleaned Text'].apply(tag_text)

# Split the dataset into training, validation, and test sets
train_df, temp_df = train_test_split(pheme_df, test_size=0.4, random_state=42)
val_df, test_df = train_test_split(temp_df, test_size=0.5, random_state=42)

# Prepare the training data
train_data = []
for tagged_text in train_df['POS Tagged Text']:
    train_data.append(tagged_text)

# Prepare the validation data
val_data = []
for tagged_text in val_df['POS Tagged Text']:
    val_data.append(tagged_text)

In [6]:
print(train_data)



In [7]:
print(val_data)

[[('horrible', 'JJ'), ('tragedy', 'NN'), ('in', 'IN'), ('paris', 'NN'), ('today', 'NN'), ('was', 'VBD'), ('not', 'RB'), ('done', 'VBN'), ('by', 'IN'), ('muslims', 'NNS'), ('but', 'CC'), ('done', 'VBN'), ('by', 'IN'), ('terrorists', 'NNS'), ('let', 'VBP'), ('us', 'PRP'), ('unite', 'VB'), ('against', 'IN'), ('hate', 'NN'), ('pray', 'NN'), ('for', 'IN'), ('peace', 'NN'), ('charliehebdo', 'NN')], [('pm', 'NN'), ('stephen', 'VBZ'), ('harper', 'JJR'), ('to', 'TO'), ('address', 'VB'), ('the', 'DT'), ('nation', 'NN'), ('on', 'IN'), ('ottawashooting', 'VBG'), ('at', 'IN'), ('pmet', 'JJ'), ('pmharper', 'NN'), ('cdnpoli', 'NN')], [('there', 'EX'), ('will', 'MD'), ('be', 'VB'), ('no', 'DT'), ('blackout', 'NN'), ('or', 'CC'), ('censoring', 'VBG'), ('the', 'DT'), ('global', 'JJ'), ('community', 'NN'), ('from', 'IN'), ('witnessing', 'VBG'), ('this', 'DT'), ('human', 'JJ'), ('travesty', 'JJ'), ('unity', 'NN'), ('ferguson', 'NN')], [('to', 'TO'), ('people', 'NNS'), ('taking', 'VBG'), ('selfies', 'NNS')

In [11]:
from main_model.scripts.pos_tagging import pos_tagging

# Assuming 'train_data' and 'val_df' are defined elsewhere
best_tagger_name, best_tagger, best_tagger_accuracy = pos_tagging(train_data, val_df)

print("\nBest Tagger:", best_tagger_name)
print("Accuracy:", best_tagger_accuracy)

Unigram Tagger Accuracy: 0.8367168867430632
Bigram Tagger Accuracy: 0.8597521959164679
Trigram Tagger Accuracy: 0.8580652667093246
HMM Tagger Accuracy: 0.8459077424233611

Best Tagger: Bigram
Accuracy: 0.8597521959164679


In [12]:
from nltk import word_tokenize

# Assuming 'best_tagger' is the best tagger selected
tagged_tweets = []
for tweet_text in pheme_df['Tweet Text']:
    tagged_tweet = best_tagger.tag(word_tokenize(tweet_text))
    tagged_tweets.append(tagged_tweet)

pheme_df['POS Tagged Text'] = tagged_tweets


In [13]:
print(pheme_df)

             Event            Tweet ID      Label  \
0     charliehebdo  552783238415265792      rumor   
1     charliehebdo  552783667052167168      rumor   
2     charliehebdo  552783745565347840      rumor   
3     charliehebdo  552784168849907712      rumor   
4     charliehebdo  552784526955806720      rumor   
...            ...                 ...        ...   
5797   sydneysiege  544521260840267776  non-rumor   
5798   sydneysiege  544521433473634304  non-rumor   
5799   sydneysiege  544521788777304064  non-rumor   
5800   sydneysiege  544521880661950464  non-rumor   
5801   sydneysiege  544521948924248066  non-rumor   

                                             Tweet Text  \
0     Breaking: At least 10 dead, 5 injured after tO...   
1     France: 10 people dead after shooting at HQ of...   
2     Ten killed in shooting at headquarters of Fren...   
3     BREAKING: 10 dead in shooting at headquarters ...   
4     Reuters: 10 people shot dead at headquarters o...   
...      

# Lemmatization