In [134]:
import os
import nltk
import math
import pandas as pd
import numpy as np
from pathlib import Path
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
import matplotlib.pyplot as plt

In [24]:
#nltk.download('punkt_tab')

In [26]:
data_folder = Path(os.getcwd()).parents[1].joinpath('data')

In [86]:
train_df = pd.read_csv(data_folder.joinpath('train_clean.csv'), index_col=0)
test_df = pd.read_csv(data_folder.joinpath('test.csv'))

In [118]:
all_df = pd.concat([train_df,test_df])
all_df.head()

Unnamed: 0,id,keyword,location,text,target,valid_location
0,1,,,Our Deeds are the Reason of this #earthquake M...,1.0,0.0
1,4,,,Forest fire near La Ronge Sask. Canada,1.0,0.0
2,5,,,All residents asked to 'shelter in place' are ...,1.0,0.0
3,6,,,"13,000 people receive #wildfires evacuation or...",1.0,0.0
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1.0,0.0


In [99]:
def create_bow_from_df(text_df):
    # create the vocabulary
    vocab = set()

    # create the bag-of-words model
    bow_model = []

    for _,row in text_df.iterrows():
        # create a dictionary to store the word counts
        word_counts = {}

        # tokenize the text
        tokens = nltk.word_tokenize(row['text'])
        tokens = [x.lower() for x in tokens]
        tokens = [x for x in tokens if x.isalnum()]

        # update the vocabulary
        vocab.update(tokens)

        # count the occurrences of each word
        for word in tokens:
            if word in word_counts:
                word_counts[word] += 1
            else:
                word_counts[word] = 1
        
        word_counts['target_label']=row['target'] if not math.isnan(row['target']) else 2
        word_counts['id'] = row['id']
        # add the word counts to the bag-of-words model
        bow_model.append(word_counts)
    return bow_model

In [100]:
bow_model = create_bow_from_df(all_df)
bow_df = pd.DataFrame(bow_model)

In [101]:
#replace Nans with zeros
bow_df = bow_df.fillna(0)

In [102]:
bow_df.head(5)

Unnamed: 0,our,deeds,are,the,reason,of,this,earthquake,may,allah,...,begovic,reserve,hattrick,ebolaoutbreak,ala,rajman,hasaka,risen,fasteners,xrwn
0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [105]:
bow_train_df = bow_df.loc[bow_df['target_label'] != 2]
bow_test_df = bow_df.loc[bow_df['target_label'] == 2]

## Model Building

In [106]:
X_train, X_test, y_train, y_test = train_test_split(bow_train_df.drop(["target_label","id"], axis=1), bow_train_df["target_label"], test_size=0.15)

In [128]:
clf = LogisticRegression(random_state=0, max_iter= 200, solver='liblinear').fit(X_train, y_train)

In [129]:
y_pred = clf.predict(X_test)

In [130]:
f1_score(y_test, y_pred)

0.7338444687842278

## Score Test Data

In [111]:
bow_test_df.head()

Unnamed: 0,our,deeds,are,the,reason,of,this,earthquake,may,allah,...,begovic,reserve,hattrick,ebolaoutbreak,ala,rajman,hasaka,risen,fasteners,xrwn
7558,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7559,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7560,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7561,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7562,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [112]:
y_score = clf.predict(bow_test_df.drop(["target_label","id"], axis=1))

In [116]:
bow_test_df['target'] = y_score

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  bow_test_df['target'] = y_score


In [117]:
bow_test_df[['id','target']].to_csv(data_folder.joinpath('submission_16102024.csv'))