In [1]:
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB

In [2]:
def handle_nan(train_data,predict_data):
    '''Input: Data to the function containing Nan values.
       Output : Cleaned data containing no Nan values.
       Function: Cleaning Nan values.
     '''
    train = train_data.fillna(" ")
    predict  = predict_data.fillna(" ")
    return train,predict

In [3]:
train = pd.read_csv("train.csv")
predict = pd.read_csv("test.csv")


train,predict = handle_nan(train,predict)

X = train['text']
y = train['label']
p = predict['text']

X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.2)

clf = Pipeline([
    ('vectorizer', CountVectorizer()),
    ('nb', MultinomialNB())
])

In [4]:
clf.fit(X_train, y_train)

Pipeline(steps=[('vectorizer', CountVectorizer()), ('nb', MultinomialNB())])

In [5]:
clf.score(X_test,y_test)

0.8939903846153846

In [6]:
predictions_test = pd.DataFrame(clf.predict(p))
test_id = pd.DataFrame(predict["id"])

submission = pd.concat([test_id, predictions_test],axis=1)
submission.columns = ["id","Bayesian_label"]
submission.to_csv("ML_Bayesian__pipeline.csv",index=False)

In [7]:
from sklearn.model_selection import cross_val_score 
scores = cross_val_score(clf, X_test,y_test, scoring="neg_mean_squared_error", cv=10)

In [8]:
import numpy as np
tree_rmse_scores = np.sqrt(-scores)

In [9]:
print("Scores:", -scores)
print("Mean:", -scores.mean()) 
print("Standard deviation:", scores.std())

Scores: [0.13461538 0.10336538 0.125      0.13701923 0.11298077 0.11538462
 0.12259615 0.10576923 0.125      0.14182692]
Mean: 0.12235576923076923
Standard deviation: 0.01237222041582478
