In [1]:
import numpy as np
import pandas as pd

import warnings
warnings.filterwarnings('ignore')

In [2]:
import spacy 
from spacy.lang.en.stop_words import STOP_WORDS
from textblob import TextBlob
from textblob import Word
import nltk
import re
from bs4 import BeautifulSoup

In [3]:
df_train = pd.read_csv(r'D:\Excel files\kaggle_disastertweets\train.csv')
df_test = pd.read_csv(r'D:\Excel files\kaggle_disastertweets\test.csv')

In [4]:
df_train.drop(['keyword','location'],axis=1,inplace=True)
df_test.drop(['keyword','location'],axis=1,inplace=True)

In [5]:
df_train.shape

(7613, 3)

In [6]:
df_test.shape

(3263, 2)

In [7]:
df_train.head()

Unnamed: 0,id,text,target
0,1,Our Deeds are the Reason of this #earthquake M...,1
1,4,Forest fire near La Ronge Sask. Canada,1
2,5,All residents asked to 'shelter in place' are ...,1
3,6,"13,000 people receive #wildfires evacuation or...",1
4,7,Just got sent this photo from Ruby #Alaska as ...,1


# Feature Engineering(Adding some features)

In [8]:
def get_avg_word_len(x):
    words = x.split()
    word_len = 0
    for word in words:
        word_len = word_len+len(word)
    return word_len/len(words)

In [9]:
### 1.Word Counts
df_train['word_counts'] = df_train['text'].apply(lambda x: len(str(x).split()))
### 2.Char Counts
df_train['char_count'] = df_train['text'].apply(lambda x: len(x))
### 3 . Avg Word Length
df_train['avg_word_len'] = df_train['text'].apply(lambda x: get_avg_word_len(x))
### 4.Stop Words count
df_train['stop_words_count'] = df_train['text'].apply(lambda x: len([t for t in x.split() if t in STOP_WORDS]))
### 5.Hashtag and Mentions count
df_train['hashtag_count'] = df_train['text'].apply(lambda x: len([t for t in x.split() if t.startswith("#")]))
df_train['mentions_count'] = df_train['text'].apply(lambda x: len([t for t in x.split() if t.startswith("@")]))
### 6. Numeric count
df_train['numeric_count'] = df_train['text'].apply(lambda x: len([t for t in x.split() if t.isdigit()]))
### 7.Upper Case 
df_train['uppercase_count'] = df_train['text'].apply(lambda x: len([t for t in x.split() if t.isupper() and len(x)>3]))

In [10]:
df_train.head()

Unnamed: 0,id,text,target,word_counts,char_count,avg_word_len,stop_words_count,hashtag_count,mentions_count,numeric_count,uppercase_count
0,1,Our Deeds are the Reason of this #earthquake M...,1,13,69,4.384615,6,1,0,0,1
1,4,Forest fire near La Ronge Sask. Canada,1,7,38,4.571429,0,0,0,0,0
2,5,All residents asked to 'shelter in place' are ...,1,22,133,5.090909,9,0,0,0,0
3,6,"13,000 people receive #wildfires evacuation or...",1,8,65,7.125,1,1,0,0,0
4,7,Just got sent this photo from Ruby #Alaska as ...,1,16,88,4.5,6,2,0,0,0


# Text Cleaning

In [11]:
contractions = { 
"ain't": "am not",
"aren't": "are not",
"can't": "cannot",
"can't've": "cannot have",
"'cause": "because",
"could've": "could have",
"couldn't": "could not",
"couldn't've": "could not have",
"didn't": "did not",
"doesn't": "does not",
"don't": "do not",
"hadn't": "had not",
"hadn't've": "had not have",
"hasn't": "has not",
"haven't": "have not",
"he'd": "he would",
"he'd've": "he would have",
"he'll": "he will",
"he'll've": "he will have",
"he's": "he is",
"how'd": "how did",
"how'd'y": "how do you",
"how'll": "how will",
"how's": "how is",
"I'd": "I would",
"I'd've": "I would have",
"I'll": "I will",
"I'll've": "I will have",
"I'm": "i am",
"I've": "i have",
"i'd": "i would",
"i'd've": "i would have",
"i'll": "I will",
"i'll've": "I will have",
"i'm": "i am",
"i've": "i have",
"isn't": "is not",
"it'd": "it would",
"it'd've": "it would have",
"it'll": "it will",
"it'll've": "it will have",
"it's": "it is",
"let's": "let us",
"ma'am": "madam",
"mayn't": "may not",
"might've": "might have",
"mightn't": "might not",
"mightn't've": "might not have",
"must've": "must have",
"mustn't": "must not",
"mustn't've": "must not have",
"needn't": "need not",
"needn't've": "need not have",
"o'clock": "of the clock",
"oughtn't": "ought not",
"oughtn't've": "ought not have",
"shan't": "shall not",
"sha'n't": "shall not",
"shan't've": "shall not have",
"she'd": "she would",
"she'd've": "she would have",
"she'll": "she will",
"she'll've": "she will have",
"she's": "she is",
"should've": "should have",
"shouldn't": "should not",
"shouldn't've": "should not have",
"so've": "so have",
"so's": "so as",
"that'd": "that would",
"that'd've": "that would have",
"that's": "that has",
"there'd": "there would",
"there'd've": "there would have",
"there's": "there is",
"they'd": "they would",
"they'd've": "they would have",
"they'll": "they will",
"they'll've": "they will have",
"they're": "they are",
"they've": "they have",
"to've": "to have",
"wasn't": "was not",
"we'd": "we would",
"we'd've": "we would have",
"we'll": "we will",
"we'll've": "we will have",
"we're": "we are",
"we've": "we have",
"weren't": "were not",
"what'll": "what will",
"what'll've": "what will have",
"what're": "what are",
"what's": "what is",
"what've": "what have",
"when's": "when is",
"when've": "when have",
"where'd": "where did",
"where's": "where is",
"where've": "where have",
"who'll": "who will",
"who'll've": "who will have",
"won't": "will not",
"won't've": "will not have",
"would've": "would have",
"wouldn't": "would not",
"wouldn't've": "would not have",
"y'all": "you all",
"y'all'd": "you all would",
"y'all'd've": "you all would have",
"y'all're": "you all are",
"y'all've": "you all have",
"you'd": "you would",
"you'd've": "you would have",
"you'll": "you will",
"you'll've": "you will have",
"you're": "you are",
"you've": "you have",
"u":"you",
" ur ":" your ",
" n ":" and ",
'bout':'about',
"cn":"can",
"hve":"have"
}

In [12]:
def cont_to_exp(x):
    if type(x) is str:
        for key in contractions:
            value = contractions[key]
            x = x.replace(key,value)
        return x
    else:
        return x

In [13]:
import unicodedata
def remove_accented_char(x):
    x = unicodedata.normalize('NFKD',x).encode('ascii','ignore').decode('utf-8','ignore')
    return x

In [14]:
import string
string.punctuation

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [15]:
## 1.lower
df_train['text'] = df_train['text'].apply(lambda x: x.lower())
## 2.cont to exp
df_train['text'] = df_train['text'].apply(lambda x: cont_to_exp(x))
## 3.acc char removal
df_train['text'] = df_train['text'].apply(lambda x: remove_accented_char(x))
## 4.Remove Emails
df_train['text'] = df_train['text'].apply(lambda x: re.sub(r'([a-zA-z0-9+._-]+@[a-zA-z0-9._-]+\.[a-zA-z0-9_-]+)','' , x))
## 5.Punctuation Removal
df_train['text'] = df_train['text'].apply(lambda x : "".join(i for i in x if i not in string.punctuation))
## 6.Remove RT
df_train['text'] = df_train['text'].apply(lambda x: re.sub('RT',"",x))
## 7. Remove multiple spaces
df_train['text'] = df_train['text'].apply(lambda x : " ".join(x.split()))
## 8.HTML TAGS removal
df_train['text'] = df_train['text'].apply(lambda x : BeautifulSoup(x,'lxml').get_text())
## 9.Remove Stopwords
df_train['text'] = df_train['text'].apply(lambda x : "".join(i for i in x if i not in STOP_WORDS))
## 10.Lemmatize
df_train['text'] = df_train['text'].apply(lambda x: " ".join([Word(word).lemmatize() for word in x.split()]))

In [16]:
## removing url
df_train['text'] = df_train['text'].replace(r'http\S+','',regex=True).replace(r'www\S+','',regex=True)

In [17]:
from collections import Counter
cnt = Counter()
for text in df_train['text'].values:
    for word in text.split():
        cnt[word]+=1
cnt.most_common(10)

[('the', 3263),
 ('n', 2252),
 ('to', 1945),
 ('of', 1825),
 ('s', 1783),
 ('nd', 1484),
 ('t', 1402),
 ('yoyou', 890),
 ('for', 890),
 ('on', 854)]

In [18]:
## removing most frequent words(10)
freqwords = set([w for (w,wc) in cnt.most_common(10)])
def remove_freqwords(text):
    return " ".join([word for word in str(text).split() if word not in freqwords])

df_train['text'] = df_train['text'].apply(lambda text : remove_freqwords(text))

In [19]:
## Rare Words Removal
text = ' '.join(df_train['text'])
text = text.split()
freq_word = pd.Series(text).value_counts()

In [20]:
#rare20 = freq_word[-20:]
#rare20
rare = freq_word[freq_word.values==1]
df_train['text'] = df_train['text'].apply(lambda x:' '.join([t for t in x.split() if t not in rare]))

In [21]:
df_train.head()

Unnamed: 0,id,text,target,word_counts,char_count,avg_word_len,stop_words_count,hashtag_count,mentions_count,numeric_count,uppercase_count
0,1,oyour deed re reson th erthqyouke my llh forgv...,1,13,69,4.384615,6,1,0,0,1
1,4,forest fre ner l cnd,1,7,38,4.571429,0,0,0,0,0
2,5,ll resdents sked shelter plce re beng by offce...,1,22,133,5.090909,9,0,0,0,0
3,6,13000 people receve wldfres evcyouton order cl...,1,8,65,7.125,1,1,0,0,0
4,7,jyoust got sent th photo from lsk smoke from w...,1,16,88,4.5,6,2,0,0,0


In [22]:
# check some processed reviews
import random

i = random.choice(range(len(df_train)))
print(f"Processed review: \n{df_train['text'].iloc[i]}")

Processed review: 
mges fmne uo hope chrst blog wht hppens when we forget god


# Now applying same steps on test data

In [23]:
df_test.head()

Unnamed: 0,id,text
0,0,Just happened a terrible car crash
1,2,"Heard about #earthquake is different cities, s..."
2,3,"there is a forest fire at spot pond, geese are..."
3,9,Apocalypse lighting. #Spokane #wildfires
4,11,Typhoon Soudelor kills 28 in China and Taiwan


In [24]:
### 1.Word Counts
df_test['word_counts'] = df_test['text'].apply(lambda x: len(str(x).split()))
### 2.Char Counts
df_test['char_count'] = df_test['text'].apply(lambda x: len(x))
### 3 . Avg Word Length
df_test['avg_word_len'] = df_test['text'].apply(lambda x: get_avg_word_len(x))
### 4.Stop Words count
df_test['stop_words_count'] = df_test['text'].apply(lambda x: len([t for t in x.split() if t in STOP_WORDS]))
### 5.Hashtag and Mentions count
df_test['hashtag_count'] = df_test['text'].apply(lambda x: len([t for t in x.split() if t.startswith("#")]))
df_test['mentions_count'] = df_test['text'].apply(lambda x: len([t for t in x.split() if t.startswith("@")]))
### 6. Numeric count
df_test['numeric_count'] = df_test['text'].apply(lambda x: len([t for t in x.split() if t.isdigit()]))
### 7.Upper Case 
df_test['uppercase_count'] = df_test['text'].apply(lambda x: len([t for t in x.split() if t.isupper() and len(x)>3]))

In [25]:
### cleaning
## 1.lower
df_test['text'] = df_test['text'].apply(lambda x: x.lower())
## 2.cont to exp
df_test['text'] = df_test['text'].apply(lambda x: cont_to_exp(x))
## 3.acc char removal
df_test['text'] = df_test['text'].apply(lambda x: remove_accented_char(x))
## 4.Remove Emails
df_test['text'] = df_test['text'].apply(lambda x: re.sub(r'([a-zA-z0-9+._-]+@[a-zA-z0-9._-]+\.[a-zA-z0-9_-]+)','' , x))
## 5.Punctuation Removal
df_test['text'] = df_test['text'].apply(lambda x : "".join(i for i in x if i not in string.punctuation))
## 6.Remove RT
df_test['text'] = df_test['text'].apply(lambda x: re.sub('RT',"",x))
## 7. Remove multiple spaces
df_test['text'] = df_test['text'].apply(lambda x : " ".join(x.split()))
## 8.HTML TAGS removal
df_test['text'] = df_test['text'].apply(lambda x : BeautifulSoup(x,'lxml').get_text())
## 9.Remove Stopwords
df_test['text'] = df_test['text'].apply(lambda x : "".join(i for i in x if i not in STOP_WORDS))
## 10.Lemmatize
df_test['text'] = df_test['text'].apply(lambda x: " ".join([Word(word).lemmatize() for word in x.split()]))
## 11.removing urls
df_test['text'] = df_test['text'].replace(r'http\S+','',regex=True).replace(r'www\S+','',regex=True)
## 12.removing most freq words
df_test['text'] = df_test['text'].apply(lambda text : remove_freqwords(text))

In [26]:
text = ' '.join(df_test['text'])
text = text.split()
freq_word = pd.Series(text).value_counts()
rare = freq_word[freq_word.values==1]
df_test['text'] = df_test['text'].apply(lambda x:' '.join([t for t in x.split() if t not in rare]))

In [27]:
df_test.head()

Unnamed: 0,id,text,word_counts,char_count,avg_word_len,stop_words_count,hashtag_count,mentions_count,numeric_count,uppercase_count
0,0,jyoust hppened terrble cr crsh,6,34,4.833333,1,0,0,0,0
1,2,herd boyout erthqyouke dfferent ctes sty sfe e...,9,64,6.222222,2,1,0,0,0
2,3,there forest fre spot pond re cross street cnn...,19,96,4.105263,10,0,0,0,1
3,9,poclypse lghtng wldfres,4,40,9.25,0,2,0,0,0
4,11,typhoon soyoudelor klls 28 chn twn,8,45,4.75,2,0,0,1,0


Data is Cleaned

# Model Building

## Model1 - Bag of Words

In [28]:
from sklearn.feature_extraction.text import TfidfVectorizer,CountVectorizer
from sklearn.metrics import accuracy_score
import lightgbm as LGBMClassifier
from xgboost import XGBClassifier
from sklearn.preprocessing import MinMaxScaler

In [29]:
y = df_train['target']

In [30]:
cv = CountVectorizer()
text_counts = cv.fit_transform(df_train['text'])
df_train_bow = pd.DataFrame(text_counts.toarray(),columns = cv.get_feature_names())
df_train_bow.head()

Unnamed: 0,001116,005225,0104,010401,015025,02,0306,05,06,06jst,...,zmbbwe,zombe,zone,zonsm,zonst,zoyoum,zppednews,zrry,zyn,zynmlk
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [31]:
X=df_train_bow
y=df_train['target']

## Xgboost

In [32]:
xgb = XGBClassifier(random_state=10)

xgb.fit(X,y)
y_pred = xgb.predict(X)
score = xgb.score(X,y)
print(score)

0.8389596742414291


In [33]:
test_vectorizer =cv.transform(df_test['text']).toarray()

In [34]:
test_vectorizer.shape

(3263, 6218)

In [35]:
###############################        Making Predictions       #################################
y_pred = xgb.predict(test_vectorizer)

In [36]:
#### model1 with xgboost for submission 
df_test['Target'] = y_pred

output = df_test[['id','Target']]
# Output to csv
output.to_csv(r'D:\Excel files\kaggle_disastertweets\submission1.csv',index=False)
output

Unnamed: 0,id,Target
0,0,1
1,2,1
2,3,1
3,9,0
4,11,1
...,...,...
3258,10861,1
3259,10865,1
3260,10868,1
3261,10874,1


In [71]:
#### kaggle score with xgboost

#449 -rank
#0.78731-score

# Lightgbm

In [52]:
import lightgbm as lgb

In [38]:
lightgbm = lgb.LGBMClassifier(random_state=10)

lightgbm.fit(X,y)
y_pred = lightgbm.predict(X)
score = lightgbm.score(X,y)
print(score)

0.8407986339156706


In [39]:
###############################        Making Predictions with lightgbm    #################################
y_pred = lightgbm.predict(test_vectorizer)
df_test['Target'] = y_pred

output = df_test[['id','Target']]
# Output to csv
output.to_csv(r'D:\Excel files\kaggle_disastertweets\submission2.csv',index=False)
output

Unnamed: 0,id,Target
0,0,1
1,2,1
2,3,1
3,9,0
4,11,1
...,...,...
3258,10861,1
3259,10865,1
3260,10868,1
3261,10874,1


In [79]:
#### kaggle score with lightgbm

#447 -rank
#0.78792-score

# Model 2 - Tfidf Vectorizer

In [40]:
tfidf = TfidfVectorizer()
text_counts = tfidf.fit_transform(df_train['text'])
df_train_tfidf = pd.DataFrame(text_counts.toarray(),columns = tfidf.get_feature_names())
df_train_tfidf.head()

Unnamed: 0,001116,005225,0104,010401,015025,02,0306,05,06,06jst,...,zmbbwe,zombe,zone,zonsm,zonst,zoyoum,zppednews,zrry,zyn,zynmlk
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [41]:
X=df_train_tfidf
y=df_train['target']

# lightgbm

In [42]:
lightgbm = lgb.LGBMClassifier(random_state=10)

lightgbm.fit(X,y)
y_pred = lightgbm.predict(X)
score = lightgbm.score(X,y)
print(score)

0.8582687508209641


In [43]:
test_vectorizer = tfidf.transform(df_test['text']).toarray()

In [44]:
###############################        tfidf - Making Predictions with lightgbm     #################################
y_pred = lightgbm.predict(test_vectorizer)
df_test['Target'] = y_pred

output = df_test[['id','Target']]
# Output to csv
output.to_csv(r'D:\Excel files\kaggle_disastertweets\submission3.csv',index=False)
output

Unnamed: 0,id,Target
0,0,1
1,2,1
2,3,1
3,9,0
4,11,1
...,...,...
3258,10861,1
3259,10865,1
3260,10868,1
3261,10874,1


In [None]:
## 0.78669 - not a improvement

# xgboost

In [45]:
xgb = XGBClassifier(random_state=10)

xgb.fit(X,y)
y_pred = xgb.predict(X)
score = xgb.score(X,y)
print(score)

0.861289898857218


In [46]:
###############################        tfidf - Making Predictions with xgboost     #################################
y_pred = xgb.predict(test_vectorizer)
df_test['Target'] = y_pred

output = df_test[['id','Target']]
# Output to csv
output.to_csv(r'D:\Excel files\kaggle_disastertweets\submission4.csv',index=False)
output

Unnamed: 0,id,Target
0,0,1
1,2,1
2,3,1
3,9,0
4,11,1
...,...,...
3258,10861,1
3259,10865,1
3260,10868,1
3261,10874,1


In [None]:
## 0.77903 - not a improvement

# Model type 3 :- Bag of Words+Manual Features

In [32]:
df_train_feat = df_train.drop(['text','target','id'],axis=1).reset_index(drop=True)
df_test_feat = df_test.drop(['text'],axis=1).reset_index(drop=True)

In [33]:
X = df_train_feat.join(df_train_bow)
X.head()

Unnamed: 0,word_counts,char_count,avg_word_len,stop_words_count,hashtag_count,mentions_count,numeric_count,uppercase_count,001116,005225,...,zmbbwe,zombe,zone,zonsm,zonst,zoyoum,zppednews,zrry,zyn,zynmlk
0,13,69,4.384615,6,1,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
1,7,38,4.571429,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,22,133,5.090909,9,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,8,65,7.125,1,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,16,88,4.5,6,2,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


## XGBoost

In [54]:
xgb = XGBClassifier(random_state=10)

xgb.fit(X,y)
y_pred = xgb.predict(X)
score = xgb.score(X,y)
print(score)

0.8627347957441219


In [50]:
df_test_feat.head(2)

Unnamed: 0,id,word_counts,char_count,avg_word_len,stop_words_count,hashtag_count,mentions_count,numeric_count,uppercase_count,Target
0,0,6,34,4.833333,1,0,0,0,0,1
1,2,9,64,6.222222,2,1,0,0,0,1


In [35]:
df_test_feat

Unnamed: 0,id,word_counts,char_count,avg_word_len,stop_words_count,hashtag_count,mentions_count,numeric_count,uppercase_count
0,0,6,34,4.833333,1,0,0,0,0
1,2,9,64,6.222222,2,1,0,0,0
2,3,19,96,4.105263,10,0,0,0,1
3,9,4,40,9.250000,0,2,0,0,0
4,11,8,45,4.750000,2,0,0,1,0
...,...,...,...,...,...,...,...,...,...
3258,10861,8,55,6.000000,0,0,0,0,7
3259,10865,23,139,5.086957,7,0,0,0,2
3260,10868,6,55,8.333333,1,0,0,0,0
3261,10874,7,65,8.428571,0,0,0,0,2


In [34]:
test_vectorizer =cv.transform(df_test['text']).toarray()
df_test_bow = pd.DataFrame(test_vectorizer)
x = df_test_feat.drop(['id'],axis=1).join(df_test_bow)
x.head()

Unnamed: 0,word_counts,char_count,avg_word_len,stop_words_count,hashtag_count,mentions_count,numeric_count,uppercase_count,0,1,...,6208,6209,6210,6211,6212,6213,6214,6215,6216,6217
0,6,34,4.833333,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,9,64,6.222222,2,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,19,96,4.105263,10,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
3,4,40,9.25,0,2,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,8,45,4.75,2,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [52]:
###############################         Making Predictions with xgboost+BOW+manual Feat     #################################
y_pred = xgb.predict(x)
df_test['Target'] = y_pred

output = df_test[['id','Target']]
# Output to csv
output.to_csv(r'D:\Excel files\kaggle_disastertweets\submission5.csv',index=False)
output

Unnamed: 0,id,Target
0,0,0
1,2,1
2,3,1
3,9,0
4,11,1
...,...,...
3258,10861,1
3259,10865,1
3260,10868,1
3261,10874,1


In [None]:
#Your submission scored 0.78240, which is not an improvement of your previous score. Keep trying

# lightgbm

In [53]:
lightgbm = lgb.LGBMClassifier(random_state=10)

lightgbm.fit(X,y)
y_pred = lightgbm.predict(X)
score = lightgbm.score(X,y)
print(score)

0.8514383291737817


In [None]:
test_vectorizer =cv.transform(df_test['text']).toarray()
df_test_bow = pd.DataFrame(test_vectorizer)
x = df_test_feat.drop(['id','Target'],axis=1).join(df_test_bow)
x.head()

In [55]:
###############################        Making Predictions with lightgbm+BOW+manual Feat     #################################
y_pred = lightgbm.predict(x)
df_test['Target'] = y_pred

output = df_test[['id','Target']]
# Output to csv
output.to_csv(r'D:\Excel files\kaggle_disastertweets\submission6.csv',index=False)
output

Unnamed: 0,id,Target
0,0,0
1,2,1
2,3,1
3,9,0
4,11,1
...,...,...
3258,10861,1
3259,10865,1
3260,10868,1
3261,10874,1


In [None]:
#Your most recent submission scored 0.79313, which is an improvement of your previous score of 0.78792. Great job!

In [35]:
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
from catboost import CatBoostClassifier
from sklearn.ensemble import ExtraTreesClassifier

## Adaboost

In [39]:
### Adaboost

ada = AdaBoostClassifier(n_estimators=200)
ada.fit(X,y)
y_pred = ada.predict(X)
score = ada.score(X,y)
print(score)

0.8176802837252068


In [40]:
###############################    Making Predictions with adaboost+BOW+manual Feat            #################################
y_pred = ada.predict(x)
df_test['Target'] = y_pred

output = df_test[['id','Target']]
# Output to csv
output.to_csv(r'D:\Excel files\kaggle_disastertweets\submission10.csv',index=False)
output

Unnamed: 0,id,Target
0,0,1
1,2,1
2,3,1
3,9,0
4,11,1
...,...,...
3258,10861,1
3259,10865,1
3260,10868,1
3261,10874,1


## GradientBoosting

In [42]:
gb = GradientBoostingClassifier(n_estimators=200)
gb.fit(X, y)
y_pred = gb.predict(X)
score = gb.score(X,y)
print(score)

0.8210954945487982


In [43]:
###############################    Making Predictions with GradientBoosting+BOW+manual Feat            #################################
y_pred = gb.predict(x)
df_test['Target'] = y_pred

output = df_test[['id','Target']]
# Output to csv
output.to_csv(r'D:\Excel files\kaggle_disastertweets\submission11.csv',index=False)
output

Unnamed: 0,id,Target
0,0,0
1,2,1
2,3,1
3,9,0
4,11,1
...,...,...
3258,10861,1
3259,10865,1
3260,10868,1
3261,10874,0


## Cat Boost Classifier

In [47]:
cat = CatBoostClassifier(iterations=1000,random_state=68)
cat.fit(X,y)
y_pred = cat.predict(X)
score = cat.score(X,y)
print(score)

Learning rate set to 0.024511
0:	learn: 0.6897013	total: 20.4ms	remaining: 20.4s
1:	learn: 0.6859787	total: 39.7ms	remaining: 19.8s
2:	learn: 0.6830148	total: 58.9ms	remaining: 19.6s
3:	learn: 0.6799288	total: 77.9ms	remaining: 19.4s
4:	learn: 0.6773233	total: 97.2ms	remaining: 19.3s
5:	learn: 0.6742763	total: 116ms	remaining: 19.2s
6:	learn: 0.6719500	total: 139ms	remaining: 19.7s
7:	learn: 0.6693434	total: 159ms	remaining: 19.7s
8:	learn: 0.6667870	total: 178ms	remaining: 19.6s
9:	learn: 0.6645575	total: 197ms	remaining: 19.5s
10:	learn: 0.6620820	total: 216ms	remaining: 19.4s
11:	learn: 0.6598284	total: 235ms	remaining: 19.3s
12:	learn: 0.6575374	total: 254ms	remaining: 19.3s
13:	learn: 0.6556198	total: 273ms	remaining: 19.2s
14:	learn: 0.6537604	total: 292ms	remaining: 19.1s
15:	learn: 0.6521449	total: 311ms	remaining: 19.1s
16:	learn: 0.6500445	total: 329ms	remaining: 19.1s
17:	learn: 0.6482148	total: 348ms	remaining: 19s
18:	learn: 0.6464042	total: 370ms	remaining: 19.1s
19:	lear

161:	learn: 0.5466714	total: 5.93s	remaining: 30.7s
162:	learn: 0.5463303	total: 5.97s	remaining: 30.7s
163:	learn: 0.5460382	total: 6.01s	remaining: 30.6s
164:	learn: 0.5456876	total: 6.05s	remaining: 30.6s
165:	learn: 0.5454267	total: 6.08s	remaining: 30.6s
166:	learn: 0.5450708	total: 6.13s	remaining: 30.6s
167:	learn: 0.5447925	total: 6.17s	remaining: 30.5s
168:	learn: 0.5445490	total: 6.21s	remaining: 30.5s
169:	learn: 0.5442232	total: 6.25s	remaining: 30.5s
170:	learn: 0.5439747	total: 6.28s	remaining: 30.5s
171:	learn: 0.5436761	total: 6.32s	remaining: 30.4s
172:	learn: 0.5433846	total: 6.36s	remaining: 30.4s
173:	learn: 0.5431682	total: 6.4s	remaining: 30.4s
174:	learn: 0.5429004	total: 6.44s	remaining: 30.4s
175:	learn: 0.5426209	total: 6.48s	remaining: 30.3s
176:	learn: 0.5422776	total: 6.52s	remaining: 30.3s
177:	learn: 0.5419914	total: 6.56s	remaining: 30.3s
178:	learn: 0.5417144	total: 6.6s	remaining: 30.3s
179:	learn: 0.5413944	total: 6.64s	remaining: 30.2s
180:	learn: 0.

321:	learn: 0.5074979	total: 12.3s	remaining: 26s
322:	learn: 0.5072627	total: 12.4s	remaining: 25.9s
323:	learn: 0.5070466	total: 12.4s	remaining: 25.9s
324:	learn: 0.5068952	total: 12.5s	remaining: 25.9s
325:	learn: 0.5067546	total: 12.5s	remaining: 25.8s
326:	learn: 0.5065361	total: 12.5s	remaining: 25.8s
327:	learn: 0.5062111	total: 12.6s	remaining: 25.8s
328:	learn: 0.5060167	total: 12.6s	remaining: 25.7s
329:	learn: 0.5058138	total: 12.6s	remaining: 25.7s
330:	learn: 0.5056306	total: 12.7s	remaining: 25.6s
331:	learn: 0.5053634	total: 12.7s	remaining: 25.6s
332:	learn: 0.5051936	total: 12.8s	remaining: 25.6s
333:	learn: 0.5049622	total: 12.8s	remaining: 25.5s
334:	learn: 0.5047921	total: 12.8s	remaining: 25.5s
335:	learn: 0.5045571	total: 12.9s	remaining: 25.5s
336:	learn: 0.5043550	total: 12.9s	remaining: 25.4s
337:	learn: 0.5041290	total: 13s	remaining: 25.4s
338:	learn: 0.5039129	total: 13s	remaining: 25.4s
339:	learn: 0.5037395	total: 13s	remaining: 25.3s
340:	learn: 0.503558

483:	learn: 0.4725287	total: 19s	remaining: 20.2s
484:	learn: 0.4723537	total: 19s	remaining: 20.2s
485:	learn: 0.4721131	total: 19.1s	remaining: 20.2s
486:	learn: 0.4719056	total: 19.1s	remaining: 20.1s
487:	learn: 0.4717029	total: 19.2s	remaining: 20.1s
488:	learn: 0.4715599	total: 19.2s	remaining: 20.1s
489:	learn: 0.4713865	total: 19.2s	remaining: 20s
490:	learn: 0.4710980	total: 19.3s	remaining: 20s
491:	learn: 0.4709391	total: 19.3s	remaining: 20s
492:	learn: 0.4707266	total: 19.4s	remaining: 19.9s
493:	learn: 0.4705502	total: 19.4s	remaining: 19.9s
494:	learn: 0.4702755	total: 19.5s	remaining: 19.9s
495:	learn: 0.4700506	total: 19.5s	remaining: 19.8s
496:	learn: 0.4698640	total: 19.6s	remaining: 19.8s
497:	learn: 0.4696760	total: 19.6s	remaining: 19.8s
498:	learn: 0.4694340	total: 19.6s	remaining: 19.7s
499:	learn: 0.4692580	total: 19.7s	remaining: 19.7s
500:	learn: 0.4690477	total: 19.7s	remaining: 19.7s
501:	learn: 0.4688485	total: 19.8s	remaining: 19.6s
502:	learn: 0.4685935	

642:	learn: 0.4439062	total: 25.5s	remaining: 14.2s
643:	learn: 0.4437345	total: 25.5s	remaining: 14.1s
644:	learn: 0.4435405	total: 25.6s	remaining: 14.1s
645:	learn: 0.4433155	total: 25.6s	remaining: 14s
646:	learn: 0.4431410	total: 25.6s	remaining: 14s
647:	learn: 0.4429981	total: 25.7s	remaining: 14s
648:	learn: 0.4428110	total: 25.7s	remaining: 13.9s
649:	learn: 0.4426156	total: 25.8s	remaining: 13.9s
650:	learn: 0.4424308	total: 25.8s	remaining: 13.8s
651:	learn: 0.4422824	total: 25.8s	remaining: 13.8s
652:	learn: 0.4421465	total: 25.9s	remaining: 13.8s
653:	learn: 0.4419851	total: 25.9s	remaining: 13.7s
654:	learn: 0.4418638	total: 26s	remaining: 13.7s
655:	learn: 0.4416585	total: 26s	remaining: 13.6s
656:	learn: 0.4415128	total: 26.1s	remaining: 13.6s
657:	learn: 0.4414352	total: 26.1s	remaining: 13.6s
658:	learn: 0.4412922	total: 26.1s	remaining: 13.5s
659:	learn: 0.4411279	total: 26.2s	remaining: 13.5s
660:	learn: 0.4409889	total: 26.2s	remaining: 13.4s
661:	learn: 0.4408280	

804:	learn: 0.4207971	total: 32s	remaining: 7.75s
805:	learn: 0.4206863	total: 32s	remaining: 7.71s
806:	learn: 0.4206430	total: 32.1s	remaining: 7.67s
807:	learn: 0.4205953	total: 32.1s	remaining: 7.63s
808:	learn: 0.4204784	total: 32.2s	remaining: 7.59s
809:	learn: 0.4203617	total: 32.2s	remaining: 7.55s
810:	learn: 0.4202020	total: 32.2s	remaining: 7.51s
811:	learn: 0.4200784	total: 32.3s	remaining: 7.47s
812:	learn: 0.4200132	total: 32.3s	remaining: 7.43s
813:	learn: 0.4199734	total: 32.4s	remaining: 7.4s
814:	learn: 0.4198055	total: 32.4s	remaining: 7.36s
815:	learn: 0.4197242	total: 32.5s	remaining: 7.32s
816:	learn: 0.4196274	total: 32.5s	remaining: 7.28s
817:	learn: 0.4195766	total: 32.5s	remaining: 7.24s
818:	learn: 0.4193904	total: 32.6s	remaining: 7.2s
819:	learn: 0.4191945	total: 32.6s	remaining: 7.16s
820:	learn: 0.4190781	total: 32.7s	remaining: 7.12s
821:	learn: 0.4189911	total: 32.7s	remaining: 7.08s
822:	learn: 0.4188618	total: 32.8s	remaining: 7.04s
823:	learn: 0.4186

964:	learn: 0.4026233	total: 38.9s	remaining: 1.41s
965:	learn: 0.4024695	total: 38.9s	remaining: 1.37s
966:	learn: 0.4023399	total: 38.9s	remaining: 1.33s
967:	learn: 0.4022320	total: 39s	remaining: 1.29s
968:	learn: 0.4020939	total: 39s	remaining: 1.25s
969:	learn: 0.4019853	total: 39.1s	remaining: 1.21s
970:	learn: 0.4018608	total: 39.1s	remaining: 1.17s
971:	learn: 0.4017100	total: 39.1s	remaining: 1.13s
972:	learn: 0.4016047	total: 39.2s	remaining: 1.09s
973:	learn: 0.4014986	total: 39.2s	remaining: 1.05s
974:	learn: 0.4013868	total: 39.2s	remaining: 1.01s
975:	learn: 0.4012376	total: 39.3s	remaining: 966ms
976:	learn: 0.4010829	total: 39.3s	remaining: 926ms
977:	learn: 0.4010068	total: 39.4s	remaining: 886ms
978:	learn: 0.4009266	total: 39.4s	remaining: 845ms
979:	learn: 0.4007804	total: 39.5s	remaining: 805ms
980:	learn: 0.4006864	total: 39.5s	remaining: 765ms
981:	learn: 0.4006248	total: 39.5s	remaining: 725ms
982:	learn: 0.4005303	total: 39.6s	remaining: 684ms
983:	learn: 0.40

## Extra Trees Classifier

In [49]:
etc = ExtraTreesClassifier(n_estimators=200,random_state=68)
etc.fit(X,y)
y_pred = etc.predict(X)
score = etc.score(X,y)
print(score)

0.9888348876921056


In [50]:
###############################    Making Predictions with extra tree +BOW+manual Feat            #################################
y_pred = etc.predict(x)
df_test['Target'] = y_pred

output = df_test[['id','Target']]
# Output to csv
output.to_csv(r'D:\Excel files\kaggle_disastertweets\submission13.csv',index=False)
output

Unnamed: 0,id,Target
0,0,1
1,2,1
2,3,1
3,9,0
4,11,1
...,...,...
3258,10861,1
3259,10865,1
3260,10868,1
3261,10874,1


In [None]:
## Your most recent submission scored 0.79926, which is an improvement of your previous score of 0.79313. Great job!

In [63]:
%%time
etc = ExtraTreesClassifier(n_estimators=1000,max_depth=80,random_state=68)
etc.fit(X,y)
y_pred = etc.predict(X)
score = etc.score(X,y)
print(score)

0.9143570208853278
CPU times: total: 5min 37s
Wall time: 5min 38s


In [64]:
###############################    Making Predictions with extra tree optimised +BOW+manual Feat            #################################
y_pred = etc.predict(x)
df_test['Target'] = y_pred

output = df_test[['id','Target']]
# Output to csv
output.to_csv(r'D:\Excel files\kaggle_disastertweets\submission16.csv',index=False)
output

Unnamed: 0,id,Target
0,0,0
1,2,0
2,3,1
3,9,0
4,11,1
...,...,...
3258,10861,0
3259,10865,1
3260,10868,1
3261,10874,1


In [None]:
##-- not a improvement

# Voting Classifier

In [55]:
from sklearn.ensemble import VotingClassifier

In [56]:
classifiers = [('Gradient Boosting Classifier', gb), ('XGboost', xgb),
               ('Extra Tree', etc), ('Light Gradient', lightgbm), ('Ada Boost', ada)]
vc = VotingClassifier(estimators = classifiers,voting='soft')
vc.fit(X, y)
y_pred = vc.predict(X)
score = vc.score(X,y)
print(score)

0.9432549586234074


In [57]:
###############################    Making Predictions with voting classifier +BOW+manual Feat            #################################
y_pred = vc.predict(x)
df_test['Target'] = y_pred

output = df_test[['id','Target']]
# Output to csv
output.to_csv(r'D:\Excel files\kaggle_disastertweets\submission14.csv',index=False)
output

Unnamed: 0,id,Target
0,0,0
1,2,1
2,3,1
3,9,0
4,11,1
...,...,...
3258,10861,1
3259,10865,1
3260,10868,1
3261,10874,1


In [None]:
##Your submission scored 0.79711, which is not an improvement of your previous score. Keep trying!

In [None]:
##hard voting

In [58]:
classifiers = [('Gradient Boosting Classifier', gb), ('XGboost', xgb),
               ('Extra Tree', etc), ('Light Gradient', lightgbm), ('Ada Boost', ada)]
vc = VotingClassifier(estimators = classifiers)
vc.fit(X, y)
y_pred = vc.predict(X)
score = vc.score(X,y)
print(score)

0.8652305267305924


In [59]:
###############################    Making Predictions with voting classifier-hard + BOW+manual Feat            #################################
y_pred = vc.predict(x)
df_test['Target'] = y_pred

output = df_test[['id','Target']]
# Output to csv
output.to_csv(r'D:\Excel files\kaggle_disastertweets\submission15.csv',index=False)
output

Unnamed: 0,id,Target
0,0,0
1,2,1
2,3,1
3,9,0
4,11,1
...,...,...
3258,10861,1
3259,10865,1
3260,10868,1
3261,10874,1


In [None]:
## Your submission scored 0.79313, which is not an improvement of your previous score. Keep trying!

# Optimising Lightgbm

In [35]:
from sklearn.experimental import enable_halving_search_cv
from sklearn.model_selection import HalvingGridSearchCV
from hyperopt import hp,fmin,tpe,STATUS_OK,Trials
from sklearn.model_selection import cross_val_score

In [87]:
space = {'boosting_type': hp.choice('boosting_type', ['gbdt','dart']),
        'max_depth': hp.choice('max_depth', [10,30,50,70,90,110,120,140]),
        'learning_rate': hp.choice('learning_rate', [0.05,0.1,0.001]),
        'min_child_weight': hp.choice('min_child_weight',[1e-3,1e-2]),
        'num_leaves' : hp.choice('num_leaves',[10,30,50,70,90,110,120,140]),
        'n_estimators' : hp.choice('n_estimators', [10, 50, 300, 750, 1200,1300,1500])
    }

In [88]:
space

{'boosting_type': <hyperopt.pyll.base.Apply at 0x1f8e09eb190>,
 'max_depth': <hyperopt.pyll.base.Apply at 0x1f8e09e77c0>,
 'learning_rate': <hyperopt.pyll.base.Apply at 0x1f8e09e7e50>,
 'min_child_weight': <hyperopt.pyll.base.Apply at 0x1f8e09e7d00>,
 'num_leaves': <hyperopt.pyll.base.Apply at 0x1f8e09e7bb0>,
 'n_estimators': <hyperopt.pyll.base.Apply at 0x1f8e09e7580>}

In [89]:
def objective(space):
    model = lgb.LGBMClassifier(boosting_type = space['boosting_type'], max_depth = space['max_depth'],
                                 learning_rate = space['learning_rate'],
                                 min_child_weight = space['min_child_weight'],
                                 num_leaves = space['num_leaves'],
                                 n_estimators = space['n_estimators'], 
                                 )
    
    accuracy = cross_val_score(model, X, y, cv = 5).mean()

    # We aim to maximize accuracy, therefore we return it as a negative value
    return {'loss': -accuracy, 'status': STATUS_OK }

In [90]:
from sklearn.model_selection import cross_val_score
trials = Trials()
best = fmin(fn= objective,
            space= space,
            algo= tpe.suggest,
            max_evals = 80,
            trials= trials)
best

100%|████████████████████████████████████████████| 80/80 [2:23:55<00:00, 107.94s/trial, best loss: -0.7020949039821296]


{'boosting_type': 0,
 'learning_rate': 1,
 'max_depth': 7,
 'min_child_weight': 1,
 'n_estimators': 1,
 'num_leaves': 3}

In [91]:
boosting_type = {0: 'gbdt', 1: 'dart'}
max_depth = {0:10,1:30,2:50,3:70,4:90,5:110,6:120,7:140}
learning_rate = {0:0.05,1:0.1,2:0.001}
min_child_weight={0:1e-3,1:1e-2}
num_leaves = {0:10,1:30,2:50,3:70,4:90,5:110,6:120,7:140}
estimators = {0:10, 1:50, 2:300, 3:750, 4:1200,5:1300,6:1500}

print(boosting_type[best['boosting_type']])
print(max_depth[best['max_depth']])
print(learning_rate[best['learning_rate']])
print(min_child_weight[best['min_child_weight']])
print(num_leaves[best['num_leaves']])
print(estimators[best['n_estimators']])

gbdt
140
0.1
0.01
70
50


In [93]:
lgbm = lgb.LGBMClassifier(boosting_type = boosting_type[best['boosting_type']], max_depth = max_depth[best['max_depth']], 
                                       learning_rate = learning_rate[best['learning_rate']], 
                                       min_child_weight = min_child_weight[best['min_child_weight']], 
                                       num_leaves = num_leaves[best['num_leaves']], 
                                       n_estimators = estimators[best['n_estimators']]).fit(X,y)

In [94]:
score = lgbm.score(X,y)
print(score)

0.8580060422960725


In [96]:
###############################    Making Predictions with optimised lightgbm+BOW+manual Feat     #################################
y_pred = lgbm.predict(x)
df_test['Target'] = y_pred

output = df_test[['id','Target']]
# Output to csv
output.to_csv(r'D:\Excel files\kaggle_disastertweets\submission7.csv',index=False)
output

Unnamed: 0,id,Target
0,0,0
1,2,1
2,3,1
3,9,0
4,11,0
...,...,...
3258,10861,1
3259,10865,1
3260,10868,1
3261,10874,1


In [97]:
## Your submission scored 0.78823, which is not an improvement of your previous score. Keep trying!

# Optimisation using Optuna

In [35]:
import optuna
import sklearn
from sklearn.model_selection import cross_val_score
import lightgbm as lgb
import optuna

In [53]:
import optuna
def objective(trial):
    param = {
        'metric': 'rmse', 
        'random_state': 48,
        'n_estimators': trial.suggest_int('n_estimators', 500, 2000,3),
        'reg_alpha': trial.suggest_loguniform('reg_alpha', 1e-3, 10.0),
        'reg_lambda': trial.suggest_loguniform('reg_lambda', 1e-3, 10.0),
        'colsample_bytree': trial.suggest_categorical('colsample_bytree', [0.3,0.4,0.5,0.6,0.7]),
        'subsample': trial.suggest_categorical('subsample', [0.4,0.5,0.6,0.7,0.8,1.0]),
        'learning_rate': trial.suggest_categorical('learning_rate', [0.006,0.008,0.01,0.014]),
        'max_depth': trial.suggest_categorical('max_depth', [10,20,100]),
        'num_leaves' : trial.suggest_int('num_leaves', 1, 1000),
        'min_child_samples': trial.suggest_int('min_child_samples', 1, 300),
        'cat_smooth' : trial.suggest_int('min_data_per_groups', 1, 100)
    }
        
    model = lgb.LGBMClassifier(**param)  
    
    return sklearn.model_selection.cross_val_score(
         model,X,y,n_jobs=-1,cv=3).mean()

In [54]:
%%time
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=100)

trial = study.best_trial

print('Accuracy: {}'.format(trial.value))
print("Best hyperparameters: {}".format(trial.params))

[32m[I 2022-09-22 15:35:20,252][0m A new study created in memory with name: no-name-8892496f-cc2e-4ee8-8def-4f31a413d05f[0m
[32m[I 2022-09-22 15:35:37,647][0m Trial 0 finished with value: 0.6587430639097179 and parameters: {'n_estimators': 1460, 'reg_alpha': 0.00876060270618543, 'reg_lambda': 0.035049772542504916, 'colsample_bytree': 0.5, 'subsample': 0.5, 'learning_rate': 0.01, 'max_depth': 10, 'num_leaves': 191, 'min_child_samples': 133, 'min_data_per_groups': 13}. Best is trial 0 with value: 0.6587430639097179.[0m
[32m[I 2022-09-22 15:36:22,270][0m Trial 1 finished with value: 0.6502040356959189 and parameters: {'n_estimators': 1862, 'reg_alpha': 0.009896077336217862, 'reg_lambda': 0.645459614305281, 'colsample_bytree': 0.4, 'subsample': 0.7, 'learning_rate': 0.008, 'max_depth': 10, 'num_leaves': 890, 'min_child_samples': 277, 'min_data_per_groups': 60}. Best is trial 0 with value: 0.6587430639097179.[0m
[32m[I 2022-09-22 15:36:57,682][0m Trial 2 finished with value: 0.66

[32m[I 2022-09-22 15:45:56,586][0m Trial 21 finished with value: 0.7068195539221517 and parameters: {'n_estimators': 1982, 'reg_alpha': 8.510572042718264, 'reg_lambda': 0.002895220501501877, 'colsample_bytree': 0.3, 'subsample': 0.4, 'learning_rate': 0.006, 'max_depth': 100, 'num_leaves': 426, 'min_child_samples': 3, 'min_data_per_groups': 36}. Best is trial 9 with value: 0.7111545035756074.[0m
[32m[I 2022-09-22 15:46:06,215][0m Trial 22 finished with value: 0.698544908094636 and parameters: {'n_estimators': 1844, 'reg_alpha': 9.383758095514622, 'reg_lambda': 0.00506901746371911, 'colsample_bytree': 0.3, 'subsample': 0.4, 'learning_rate': 0.006, 'max_depth': 100, 'num_leaves': 456, 'min_child_samples': 26, 'min_data_per_groups': 33}. Best is trial 9 with value: 0.7111545035756074.[0m
[32m[I 2022-09-22 15:46:26,385][0m Trial 23 finished with value: 0.7064260088075004 and parameters: {'n_estimators': 1607, 'reg_alpha': 3.157212358894155, 'reg_lambda': 0.0032451937709983805, 'cols

[32m[I 2022-09-22 15:51:39,331][0m Trial 42 finished with value: 0.691057611339566 and parameters: {'n_estimators': 710, 'reg_alpha': 5.605799860300203, 'reg_lambda': 0.12206645850402092, 'colsample_bytree': 0.3, 'subsample': 0.8, 'learning_rate': 0.01, 'max_depth': 100, 'num_leaves': 820, 'min_child_samples': 50, 'min_data_per_groups': 6}. Best is trial 31 with value: 0.712729771175414.[0m
[32m[I 2022-09-22 15:52:37,383][0m Trial 43 finished with value: 0.7077396895269684 and parameters: {'n_estimators': 884, 'reg_alpha': 1.5422700621018324, 'reg_lambda': 0.42944520858823565, 'colsample_bytree': 0.3, 'subsample': 0.8, 'learning_rate': 0.01, 'max_depth': 100, 'num_leaves': 649, 'min_child_samples': 1, 'min_data_per_groups': 54}. Best is trial 31 with value: 0.712729771175414.[0m
[32m[I 2022-09-22 15:52:51,389][0m Trial 44 finished with value: 0.6483656799255443 and parameters: {'n_estimators': 1760, 'reg_alpha': 2.171905533274347, 'reg_lambda': 3.880228871587821, 'colsample_byt

[32m[I 2022-09-22 15:59:13,653][0m Trial 63 finished with value: 0.7049807322341196 and parameters: {'n_estimators': 791, 'reg_alpha': 1.915934271932855, 'reg_lambda': 0.6157138789347104, 'colsample_bytree': 0.3, 'subsample': 0.8, 'learning_rate': 0.01, 'max_depth': 100, 'num_leaves': 712, 'min_child_samples': 12, 'min_data_per_groups': 72}. Best is trial 31 with value: 0.712729771175414.[0m
[32m[I 2022-09-22 15:59:22,975][0m Trial 64 finished with value: 0.6988064949749745 and parameters: {'n_estimators': 665, 'reg_alpha': 3.8690410093911667, 'reg_lambda': 1.1229099874334885, 'colsample_bytree': 0.3, 'subsample': 0.8, 'learning_rate': 0.01, 'max_depth': 100, 'num_leaves': 540, 'min_child_samples': 33, 'min_data_per_groups': 24}. Best is trial 31 with value: 0.712729771175414.[0m
[32m[I 2022-09-22 15:59:35,814][0m Trial 65 finished with value: 0.6978874465113587 and parameters: {'n_estimators': 545, 'reg_alpha': 7.24746221615744, 'reg_lambda': 2.6871275233879275, 'colsample_byt

[32m[I 2022-09-22 16:07:13,114][0m Trial 84 finished with value: 0.6884300945947444 and parameters: {'n_estimators': 626, 'reg_alpha': 9.990301995418035, 'reg_lambda': 0.0064852108392073225, 'colsample_bytree': 0.3, 'subsample': 0.8, 'learning_rate': 0.01, 'max_depth': 100, 'num_leaves': 667, 'min_child_samples': 17, 'min_data_per_groups': 57}. Best is trial 31 with value: 0.712729771175414.[0m
[32m[I 2022-09-22 16:07:28,108][0m Trial 85 finished with value: 0.7005133584286937 and parameters: {'n_estimators': 1886, 'reg_alpha': 3.4426128543241634, 'reg_lambda': 0.8473590621439382, 'colsample_bytree': 0.3, 'subsample': 0.8, 'learning_rate': 0.01, 'max_depth': 100, 'num_leaves': 438, 'min_child_samples': 35, 'min_data_per_groups': 76}. Best is trial 31 with value: 0.712729771175414.[0m
[32m[I 2022-09-22 16:08:02,330][0m Trial 86 finished with value: 0.7081329240298482 and parameters: {'n_estimators': 1790, 'reg_alpha': 7.1884198739955, 'reg_lambda': 0.35869415514998093, 'colsampl

Accuracy: 0.712729771175414
Best hyperparameters: {'n_estimators': 1931, 'reg_alpha': 5.999868782231083, 'reg_lambda': 0.008877562474977126, 'colsample_bytree': 0.3, 'subsample': 0.5, 'learning_rate': 0.01, 'max_depth': 100, 'num_leaves': 443, 'min_child_samples': 2, 'min_data_per_groups': 36}
CPU times: total: 50.8 s
Wall time: 35min 51s


In [55]:
best_model = study.best_params
best_model

{'n_estimators': 1931,
 'reg_alpha': 5.999868782231083,
 'reg_lambda': 0.008877562474977126,
 'colsample_bytree': 0.3,
 'subsample': 0.5,
 'learning_rate': 0.01,
 'max_depth': 100,
 'num_leaves': 443,
 'min_child_samples': 2,
 'min_data_per_groups': 36}

In [58]:
lgbm = lgb.LGBMClassifier(n_estimators= 1931,
 reg_alpha= 5.999868782231083,
 reg_lambda= 0.008877562474977126,
 colsample_bytree= 0.3,
 subsample= 0.5,
 learning_rate= 0.01,
 max_depth= 100,
 num_leaves= 443,
 min_child_samples= 2,
 min_data_per_groups= 36).fit(X,y)
score =  lgbm.score(X,y)
print(score)

0.8511756206488901


In [59]:
###############################    Making Predictions with optimised lightgbm+BOW+manual Feat with optuna    #################################
y_pred = lgbm.predict(x)
df_test['Target'] = y_pred

output = df_test[['id','Target']]
# Output to csv
output.to_csv(r'D:\Excel files\kaggle_disastertweets\submission9.csv',index=False)
output

Unnamed: 0,id,Target
0,0,0
1,2,1
2,3,1
3,9,0
4,11,0
...,...,...
3258,10861,1
3259,10865,1
3260,10868,1
3261,10874,1


In [None]:
## - no improvement

# Model Type 4 - Word2Vec

In [28]:
import spacy 
import en_core_web_lg

In [29]:
nlp = en_core_web_lg.load()

In [30]:
def get_vector(a):
    doc = nlp(a)
    vec = doc.vector
    return vec

In [31]:
df_train['vec'] = df_train['text'].apply(lambda a:get_vector(a))

In [32]:
X = df_train['vec'].to_numpy()
X = X.reshape(-1,1)

In [33]:
X.shape

(7613, 1)

In [34]:
X = np.concatenate(np.concatenate(X,axis=0),axis=0).reshape(-1,300)
X.shape

(7613, 300)

In [35]:
y = df_train['target']

In [36]:
lightgbm = lgb.LGBMClassifier(random_state=10)

lightgbm.fit(X,y)
score = lightgbm.score(X,y)
print(score)

0.9446998555103113


In [37]:
df_test['vec'] = df_test['text'].apply(lambda a:get_vector(a))

In [38]:
test_vectorizer = df_test['vec'].to_numpy()
test_vectorizer = test_vectorizer.reshape(-1,1)
x = np.concatenate(np.concatenate(test_vectorizer,axis=0),axis=0).reshape(-1,300)
x.shape

(3263, 300)

In [40]:
###############################    Making Predictions with lgbm using word2vec model     #################################
y_pred = lightgbm.predict(x)

y_pred = lightgbm.predict(x)
df_test['Target'] = y_pred

output = df_test[['id','Target']]
# Output to csv
output.to_csv(r'D:\Excel files\kaggle_disastertweets\submission8.csv',index=False)
output

Unnamed: 0,id,Target
0,0,1
1,2,0
2,3,1
3,9,0
4,11,1
...,...,...
3258,10861,1
3259,10865,1
3260,10868,1
3261,10874,1
