In [6]:
!pip install textacy
!pip install catboost
import xgboost as xgb
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import mean_squared_error
import pandas as pd
import numpy as np
%matplotlib inline
import matplotlib.pyplot as plt
from sklearn import tree
from sklearn.model_selection import train_test_split
import string
import nltk
nltk.download('stopwords')
nltk.download('wordnet')
from nltk.corpus import stopwords
import textacy
from textblob import TextBlob
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from wordcloud import WordCloud

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [7]:
train_original = pd.read_csv('https://raw.githubusercontent.com/maxrinal/nlp-disaster/master/train.csv')
test_original = pd.read_csv('https://raw.githubusercontent.com/maxrinal/nlp-disaster/master/test.csv')

In [8]:
train=train_original.drop(columns=['keyword', 'location'])
test = test_original.drop(columns=['keyword', 'location'])
train['text'] = train['text'].str.lower()
test['text'] = test['text'].str.lower()

def remove_punctuation(text):
    return text.translate(str.maketrans('', '', string.punctuation))

train['text_clean'] = train.text.apply(lambda x: remove_punctuation(x))
test['text_clean'] = test.text.apply(lambda x: remove_punctuation(x))

STOPWORDS = set(stopwords.words('english') +  ['u', 'ü', 'ur', '4', '2', 'im', 'dont', 'doin', 'ure'])
def stopwords_(text):
    return " ".join([word for word in str(text).split() if word not in STOPWORDS])

train['text_clean'] = train['text_clean'].apply(lambda text: stopwords_(text))
test['text_clean'] = test['text_clean'].apply(lambda text: stopwords_(text))

In [9]:
from nltk.stem import WordNetLemmatizer

lemmatizer = WordNetLemmatizer()
def lemmatizer_(text):
    return " ".join([lemmatizer.lemmatize(word) for word in text.split()])

train['text_clean'] = train['text_clean'].apply(lambda text: lemmatizer_(text))
test['text_clean'] = test['text_clean'].apply(lambda text: lemmatizer_(text))

In [11]:
train.text_clean = train.text_clean.str.translate(str.maketrans('','','1234567890'))
test.text_clean = test.text_clean.str.translate(str.maketrans('','','1234567890'))

train['text_clean'] = train['text_clean'].apply(lambda x: ' '.join( [ a for a in x.split() if len(a)> 1 ]))
test['text_clean'] = test['text_clean'].apply(lambda x: ' '.join( [ a for a in x.split() if len(a)> 1 ]))

In [12]:
from sklearn.model_selection import train_test_split

seed = 42

X = train.text_clean
y = train.target

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=seed)

In [13]:
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression, PassiveAggressiveClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB, BernoulliNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC

In [14]:
def acc_summary(pipeline, X_train, y_train, X_test, y_test):
    sentiment_fit = pipeline.fit(X_train, y_train)
    y_pred = sentiment_fit.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
   
    print("-"*30)
    
    print("accuracy score: {0:.2f}%".format(accuracy*100))
    
    print("-"*30)
    
    return accuracy

In [16]:
names = ["K Nearest Neighbors", "Decision Tree", "Random Forest","XGBoost", "Logistic Regression","Naive Bayes", "SVC"]

classifiers = [
    KNeighborsClassifier(n_neighbors=3),
    DecisionTreeClassifier(random_state=0),
    RandomForestClassifier(n_estimators=100),
    xgb.XGBClassifier(objective ='binary:logistic', 
                colsample_bytree = 0.3, learning_rate = 0.1,
                max_depth = 20, alpha = 10, n_estimators = 150),
    LogisticRegression(),
    MultinomialNB(),
    SVC(kernel="linear")
]
    
zipped_clf = zip(names, classifiers)
tvec = TfidfVectorizer()
stop_words = set(stopwords.words("english"))

def compare_clf(classifier=zipped_clf, vectorizer=tvec, n_features=10000, ngram_range=(1, 1)):
    result = []
    vectorizer.set_params(stop_words=stop_words, max_features=n_features, ngram_range=ngram_range)
    for n, c in classifier:
        checker_pipeline = Pipeline([
            ("vectorizer", vectorizer),
            ("classifier", c)
        ])
        clf_acc = acc_summary(checker_pipeline, X_train, y_train, X_test, y_test)
        print("Model result for {}".format(n))
        print(c)
        result.append((n, clf_acc))
    return result

In [17]:
trigram_result = compare_clf()

------------------------------
accuracy score: 70.27%
------------------------------
Model result for K Nearest Neighbors
KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=3, p=2,
                     weights='uniform')
------------------------------
accuracy score: 72.02%
------------------------------
Model result for Decision Tree
DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
                       max_depth=None, max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort='deprecated',
                       random_state=0, splitter='best')
------------------------------
accuracy score: 76.84%
------------------------------
Model result for Random Forest
RandomForestClassifier(bootstrap=Tr

In [18]:
trigram_result

[('K Nearest Neighbors', 0.7027145359019265),
 ('Decision Tree', 0.7202276707530648),
 ('Random Forest', 0.7683887915936952),
 ('XGBoost', 0.7762697022767076),
 ('Logistic Regression', 0.8112959719789843),
 ('Naive Bayes', 0.8042907180385289),
 ('SVC', 0.7968476357267951)]

In [None]:

def prediction(pipeline, testtext):
    sentiment_fit = pipeline.fit(X_train,y_train)
    y_pred = sentiment_fit.predict(testtext)
    
    return y_pred

In [None]:
vectorizer=TfidfVectorizer()
checker_pipeline = Pipeline([
            ('vectorizer', vectorizer),
            ('classifier', KNeighborsClassifier(n_neighbors=6))
        ])
vectorizer.set_params(stop_words=None, max_features=100000, ngram_range=(1,4))
prediction=prediction(checker_pipeline,test['text'])

In [None]:
prediction_random_forest = pd.DataFrame({'id':test.id, 'target':prediction} )

In [None]:
prediction_logistic_Regresion =  pd.DataFrame({'id':test.id, 'target':prediction} )

In [None]:
prediction_knn = pd.DataFrame({'id':test.id, 'target':prediction} )

In [None]:
result = pd.DataFrame({'id':test.id, 'target':prediction} )
result.to_csv('prueba_random_forest.csv',header=True,index = False)

In [None]:
from functools import reduce
dfs = [prediction_random_forest, prediction_logistic_Regresion, prediction_knn]
df_final = reduce(lambda left,right: pd.merge(left,right,on='id'), dfs)
df_final['target_final'] =((df_final['target_x'] + df_final['target_y'] + df_final['target']) / 3).round()

df_final = df_final[['id', 'target_final']]
df_final.columns = ['id', 'target']
df_final

Unnamed: 0,id,target
0,0,0.0
1,2,1.0
2,3,1.0
3,9,0.0
4,11,1.0
...,...,...
3258,10861,1.0
3259,10865,0.0
3260,10868,1.0
3261,10874,1.0


In [None]:
df_final.to_csv('prueba_combino_randomForest_Knn_regresionLogistica.csv',header=True,index = False)