In [None]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from sklearn import feature_extraction, linear_model, model_selection, preprocessing
from gensim.models import Word2Vec

In [None]:
# Load the labelled training set and the unlabelled test set from the working directory.
train_df, test_df = (pd.read_csv(path) for path in ("train.csv", "test.csv"))

In [None]:
# Preview the first rows of the training data (id, keyword, location, text, target).
train_df.head()

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1


In [None]:
# Preview the first rows of the test data (same columns as train, minus target).
test_df.head()

Unnamed: 0,id,keyword,location,text
0,0,,,Just happened a terrible car crash
1,2,,,"Heard about #earthquake is different cities, s..."
2,3,,,"there is a forest fire at spot pond, geese are..."
3,9,,,Apocalypse lighting. #Spokane #wildfires
4,11,,,Typhoon Soudelor kills 28 in China and Taiwan


In [None]:
# Show one example tweet per class; the labels below map target 0 to
# non-disaster tweets and target 1 to disaster tweets.
print("non-disaster tweet: ", train_df[train_df["target"] == 0]["text"].values[0])
print("disaster tweet: ", train_df[train_df["target"] == 1]["text"].values[0])

non-disatser tweet:  What's up man?
disaster tweet:  Our Deeds are the Reason of this #earthquake May ALLAH Forgive us all


In [None]:
count_vectorizer = feature_extraction.text.CountVectorizer()

## fit the vectorizer on every training tweet and get its token-count matrix
## (the full "text" column is passed here, not only the first 5 tweets)
example_train_vectors = count_vectorizer.fit_transform(train_df["text"])

In [None]:
## these vectors are "sparse" (only non-zero elements are kept to save space),
## so densify the first tweet's row once and show both its shape and its contents
first_tweet_counts = example_train_vectors[0].todense()
print(first_tweet_counts.shape)
print(first_tweet_counts)

(1, 21637)
[[0 0 0 ... 0 0 0]]


In [None]:
## refit the vectorizer on all training tweets, then map the test tweets with
## .transform() (NOT .fit_transform()) so that the train and test vectors use
## the same set of tokens
train_vectors = count_vectorizer.fit_transform(train_df["text"])
test_vectors = count_vectorizer.transform(test_df["text"])

In [None]:
## Our vectors are really big (21637 columns in the output above), so we want
## to push our model's weights toward 0 without completely discounting
## different words - ridge regression is a good way to do this.
## The classifier is created with its default settings.
clf = linear_model.RidgeClassifier()

In [None]:
## 3-fold cross-validation of the ridge classifier, scored with F1
scores = model_selection.cross_val_score(
    estimator=clf,
    X=train_vectors,
    y=train_df["target"],
    cv=3,
    scoring="f1",
)
scores

array([0.59453669, 0.5642787 , 0.64082434])

In [None]:
## fit the ridge classifier on all training vectors and their targets
clf.fit(train_vectors, train_df["target"])

In [None]:
## load the submission template and fill its "target" column with the
## classifier's predictions for the test vectors
sample_submission = pd.read_csv("sample_submission.csv").assign(
    target=clf.predict(test_vectors)
)
sample_submission

Unnamed: 0,id,target
0,0,0
1,2,1
2,3,1
3,9,0
4,11,1
...,...,...
3258,10861,1
3259,10865,1
3260,10868,1
3261,10874,1


In [None]:
## write the predictions to submission.csv without the DataFrame index column
sample_submission.to_csv("submission.csv", index=False)

In [None]:
import numpy as np
import pandas as pd
from sklearn import feature_extraction, linear_model, model_selection, preprocessing


## Consolidated run of the bag-of-words baseline; it repeats the steps of the
## earlier cells in one place: load, vectorize, score, fit, predict.
train_df, test_df = (pd.read_csv(path) for path in ("train.csv", "test.csv"))


count_vectorizer = feature_extraction.text.CountVectorizer()
## demo only: token counts for the first 5 tweets in the data
## (the vectorizer is refit on every training tweet just below)
example_train_vectors = count_vectorizer.fit_transform(train_df["text"].head(5))


## learn the vocabulary from the training tweets, then reuse it unchanged for
## the test tweets: .transform() (NOT .fit_transform()) makes sure the train
## and test vectors use the same set of tokens
train_vectors = count_vectorizer.fit_transform(train_df["text"])
test_vectors = count_vectorizer.transform(test_df["text"])


## Our vectors are really big, so we want to push our model's weights
## toward 0 without completely discounting different words - ridge regression
## is a good way to do this.
clf = linear_model.RidgeClassifier()


## 3-fold cross-validated F1, followed by a final fit on all training rows
scores = model_selection.cross_val_score(
    estimator=clf,
    X=train_vectors,
    y=train_df["target"],
    cv=3,
    scoring="f1",
)
clf.fit(train_vectors, train_df["target"])


## fill the submission template with the test predictions and preview it
sample_submission = pd.read_csv("sample_submission.csv").assign(
    target=clf.predict(test_vectors)
)
sample_submission.head()

Unnamed: 0,id,target
0,0,0
1,2,1
2,3,1
3,9,0
4,11,1


Word2Vec NLP

In [None]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import string
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score



In [None]:
# Fetch the NLTK resources used by the text preprocessing in this notebook:
# the "punkt" tokenizer models, the "stopwords" corpus and the "wordnet" corpus.
# NOTE(review): newer NLTK releases may also need "punkt_tab" for word_tokenize —
# confirm against the installed version.
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


True

In [None]:
# Built once when the cell runs rather than on every call: loading the English
# stop-word list and constructing the lemmatizer do not depend on the tweet
# being processed, and preprocess_text is applied to every row.
_STOP_WORDS = frozenset(stopwords.words("english"))
_LEMMATIZER = WordNetLemmatizer()


def preprocess_text(text):
    """Turn one raw tweet into a list of cleaned, lemmatized tokens.

    Steps, in order:
      1. tokenize with NLTK's ``word_tokenize``;
      2. keep only tokens that are purely alphanumeric (``str.isalnum``), which
         drops punctuation tokens and any token containing a non-alphanumeric
         character, and lowercase what remains;
      3. remove English stop words;
      4. lemmatize each remaining token with ``WordNetLemmatizer``.

    Args:
        text: The raw tweet text (a string).

    Returns:
        A list of lowercase, lemmatized token strings; empty if nothing survives
        the filters.
    """
    lowered = (token.lower() for token in word_tokenize(text) if token.isalnum())
    return [
        _LEMMATIZER.lemmatize(token)
        for token in lowered
        if token not in _STOP_WORDS
    ]


train_df["preprocessed_text"] = train_df["text"].apply(preprocess_text)

In [None]:
# Word2Vec

# One token list per training tweet (a list of lists), passed to gensim as `sentences`.
tokenized_texts = list(train_df["preprocessed_text"])

# Train Word2Vec: 200-dimensional vectors, context window of 3,
# skip-gram (sg=1), ignoring words that occur fewer than 10 times.
model_w2v = Word2Vec(
    sentences=tokenized_texts,
    vector_size=200,
    window=3,
    min_count=10,
    sg=1,
)

def get_sentence_embedding(token_list, model):
    """Average the word vectors of the tokens that the model knows.

    Args:
        token_list: Iterable of token strings for one sentence.
        model: Object exposing ``wv`` (supports ``in`` and ``[]`` lookups by
            word) and ``vector_size``.

    Returns:
        The element-wise mean of the vectors of all tokens found in
        ``model.wv``; a zero vector of length ``model.vector_size`` when none
        of the tokens is found.
    """
    known_tokens = [token for token in token_list if token in model.wv]
    if not known_tokens:
        # Nothing to average: fall back to an all-zero embedding.
        return np.zeros(model.vector_size)
    return np.mean([model.wv[token] for token in known_tokens], axis=0)



In [None]:
# NOTE(review): no cell between the 3-fold ridge cross-validation and this one
# reassigns `scores`, yet the recorded output below has five values — presumably
# left over from a cell that has since been removed; re-run the notebook top to
# bottom to confirm.
scores

array([0.45388788, 0.50551315, 0.52976704, 0.47195013, 0.53378956])

In [None]:
# TF-IDF over the already-tokenized tweets: the identity tokenizer hands each
# token list through unchanged, and lowercasing is switched off.
tfidf_vectorizer = TfidfVectorizer(tokenizer=lambda tokens: tokens, lowercase=False)
tfidf_matrix = tfidf_vectorizer.fit_transform(tokenized_texts)

# Train Word2Vec again, with the same hyperparameters as the earlier Word2Vec cell.
model_w2v = Word2Vec(
    sentences=tokenized_texts,
    vector_size=200,
    window=3,
    min_count=10,
    sg=1,
)

def get_weighted_sentence_embedding(token_list, model, tfidf_model):
    """Average the word vectors of a sentence, each scaled by its IDF value.

    Only tokens present in both ``model.wv`` and ``tfidf_model.vocabulary_``
    contribute. Each contributing vector is multiplied by the entry of
    ``tfidf_model.idf_`` at the token's vocabulary index, and the scaled
    vectors are then averaged with a plain mean (divided by the number of
    contributing tokens, not by the sum of the weights).

    Args:
        token_list: Iterable of token strings for one sentence.
        model: Object exposing ``wv`` (word -> vector lookups) and ``vector_size``.
        tfidf_model: Object exposing ``vocabulary_`` (word -> index) and ``idf_``.

    Returns:
        The mean of the scaled vectors, or a zero vector of length
        ``model.vector_size`` when no token contributes.
    """
    vocabulary = tfidf_model.vocabulary_
    scaled_vectors = [
        model.wv[token] * tfidf_model.idf_[vocabulary[token]]
        for token in token_list
        if token in model.wv and token in vocabulary
    ]
    if not scaled_vectors:
        # No token is known to both models: fall back to an all-zero embedding.
        return np.zeros(model.vector_size)
    return np.mean(scaled_vectors, axis=0)

# Embed every training tweet with the IDF-weighted Word2Vec average.
train_embeddings_weighted = [
    get_weighted_sentence_embedding(tokens, model_w2v, tfidf_vectorizer)
    for tokens in tokenized_texts
]

# Random forest of 100 trees with a fixed random_state, using all available cores.
clf = RandomForestClassifier(n_estimators=100, random_state=42, n_jobs=-1)

# 5-fold cross-validated F1 on the training embeddings.
scores = cross_val_score(
    estimator=clf,
    X=train_embeddings_weighted,
    y=train_df["target"],
    cv=5,
    scoring="f1",
)

# Final fit on the entire training data.
clf.fit(train_embeddings_weighted, train_df["target"])




In [None]:
# 5-fold F1 scores of the random forest on the weighted Word2Vec embeddings.
scores

array([0.60264317, 0.57624113, 0.61354582, 0.5106383 , 0.69045643])

In [None]:
# Apply the same preprocessing and weighted embedding to the test tweets,
# then overwrite the submission's "target" column with the forest's predictions.
# NOTE(review): in the cells visible here nothing writes these predictions to
# disk afterwards — confirm whether submission.csv should be regenerated.
test_df["preprocessed_text"] = test_df["text"].apply(preprocess_text)
test_embeddings = [
    get_weighted_sentence_embedding(tokens, model_w2v, tfidf_vectorizer)
    for tokens in test_df["preprocessed_text"]
]
sample_submission["target"] = clf.predict(test_embeddings)
sample_submission

Unnamed: 0,id,target
0,0,1
1,2,0
2,3,1
3,9,1
4,11,1
...,...,...
3258,10861,0
3259,10865,1
3260,10868,1
3261,10874,1
