In [6]:
import pandas as pd

In [3]:
# read data
df = pd.read_csv('Data_drop_year.csv')

In [4]:
from IPython.display import Markdown, display
def printmd(string):
    display(Markdown(string))
#printmd('**bold**')

In [7]:
print("Number of rows in data =",df.shape[0])
print("Number of columns in data =",df.shape[1])
print("\n")
printmd("**Sample data:**")

Number of rows in data = 87646
Number of columns in data = 16




**Sample data:**

In [8]:
categories = list(df.columns.values)
categories = categories[6:]

In [11]:
from sklearn.model_selection import train_test_split

train, test = train_test_split(df, random_state=42, test_size=0.20, shuffle=True)

print(train.shape)
print(test.shape)

(70116, 16)
(17530, 16)


In [12]:
train_text = train['Comment']
test_text = test['Comment']

In [13]:
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer(strip_accents='unicode', analyzer='word', ngram_range=(1,3), norm='l2')
vectorizer.fit(train_text)
vectorizer.fit(test_text)

In [14]:
x_train = vectorizer.transform(train_text)
y_train = train.drop(labels = ['Comment','Name','Time','Store'], axis=1)

x_test = vectorizer.transform(test_text)
y_test = test.drop(labels = ['Comment','Name','Time','Store'], axis=1)

In [None]:
#build decision tree
from sklearn.tree import DecisionTreeClassifier
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score
from sklearn.multiclass import OneVsRestClassifier

In [11]:
%%time
# Using pipeline for applying decision tree and one vs rest classifier
DT_pipeline = Pipeline([
                ('clf', OneVsRestClassifier(DecisionTreeClassifier(random_state=0), n_jobs=-1)),
            ])

for category in categories:
    printmd('**Processing {} comments...**'.format(category))
    
    # Training decision tree model on train data
    DT_pipeline.fit(x_train, train[category])
    
    # calculating test accuracy
    prediction = DT_pipeline.predict(x_test)
    print('Test accuracy is {}'.format(accuracy_score(test[category], prediction)))
    print("\n")

**Processing  'data integration' comments...**

Test accuracy is 0.9814026013539211




**Processing  'marketing and communication' comments...**

Test accuracy is 0.9743287442001978




**Processing  'technology' comments...**

Test accuracy is 0.8986841104434472




**Processing  'payment and checkout' comments...**

Test accuracy is 0.9234806419715524




**Processing  'shopping experience' comments...**

Test accuracy is 0.9271316650186354




**Processing  'unemployment' comments...**

Test accuracy is 0.9607134707537841




**Processing  'product available and store design' comments...**

Test accuracy is 0.96318551760858




**Processing  'price and value' comments...**

Test accuracy is 0.9528789837985853




**Processing  'general' comments...**

Test accuracy is 0.9339393017418423




**Processing  'privacy and security' comments...**

Test accuracy is 0.959002053700464


CPU times: total: 969 ms
Wall time: 17min 33s


# Test ROC

In [12]:
# create train and test data
X_train, X_test, y_train, y_test = train_test_split(df['Comment'], df.drop(['Name','Time','Store','Comment','Aspect','polarity'],axis=1), test_size=0.2, random_state=42)

In [13]:
y_train.head()

Unnamed: 0,'data integration','marketing and communication','technology','payment and checkout','shopping experience','unemployment','product available and store design','price and value','general','privacy and security'
1647,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
39155,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
52561,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
21969,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
87221,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0


In [14]:
# create a pipeline for decision tree with onevsrest classifier and tfidf vectorizer
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.multiclass import OneVsRestClassifier
from sklearn.tree import DecisionTreeClassifier

#import roc_auc_score
from sklearn.metrics import roc_auc_score

# create a pipeline
pipeline = Pipeline([
    ('tfidf', TfidfVectorizer(stop_words='english')),
    ('clf', OneVsRestClassifier(DecisionTreeClassifier(random_state=0), n_jobs=-1)),
])

# fit the training dataset on the pipeline
pipeline.fit(X_train, y_train)

# make predictions on the test set
predictions = pipeline.predict(X_test)

# calculate accuracy
accuracy = accuracy_score(y_test,predictions)
print("Accuracy = ", accuracy)

# calculate roc_auc_score
roc_auc = roc_auc_score(y_test,predictions)
print("ROC_AUC_Score = ", roc_auc)

Accuracy =  0.7033086138049058
ROC_AUC_Score =  0.9082432039848973


In [15]:
y_train_pred = pipeline.predict_proba(X_train)
y_test_pred = pipeline.predict_proba(X_test)



In [16]:
roc_auc_score_train = roc_auc_score(y_train, y_train_pred, average='weighted')
roc_auc_score_test = roc_auc_score(y_test, y_test_pred, average='weighted')

print("roc_auc_score_train = ", roc_auc_score_train)
print("roc_auc_score_test = ", roc_auc_score_test)


roc_auc_score_train =  0.999999633409527
roc_auc_score_test =  0.917772122648156


# Create pipeline with k-fold cross validation


In [37]:
# new train and test data
X_train, X_test, y_train, y_test = train_test_split(df['Comment'], df.drop(['Name','Time','Store','Comment','Aspect','polarity'],axis=1), test_size=0.2, random_state=42)
y_train.head()

Unnamed: 0,'data integration','marketing and communication','technology','payment and checkout','shopping experience','unemployment','product available and store design','price and value','general','privacy and security'
1647,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
39155,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
52561,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
21969,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
87221,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0


In [38]:
# vectorize the text 
from sklearn.feature_extraction.text import TfidfVectorizer
TfidfVectorizer = TfidfVectorizer(max_df=0.8, max_features=5000, ngram_range=(1,2))

# fit and transform train and test features
X_train_tfidf = TfidfVectorizer.fit_transform(X_train)
X_test_tfidf = TfidfVectorizer.transform(X_test)

In [39]:
# cross validation for logistic regression with onevsrest classifier
from sklearn.model_selection import cross_val_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.multiclass import OneVsRestClassifier

# onerest classifier with kfold cv
clf = OneVsRestClassifier(DecisionTreeClassifier(random_state=0), n_jobs=-1)
scores = cross_val_score(clf, X_train_tfidf, y_train, cv=10, scoring='roc_auc')

# print mean score and 10 fold scores
print("Mean ROC AUC score: ", scores.mean())
print("ROC AUC scores: ", scores)


Mean ROC AUC score:  0.8966580305815193
ROC AUC scores:  [0.89584775 0.89506272 0.89776839 0.89828844 0.89994644 0.89672326
 0.8994449  0.89491632 0.89415776 0.89442433]


In [54]:
# cross validation for logistic regression with onevsrest classifier
from sklearn.model_selection import cross_val_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.multiclass import OneVsRestClassifier

# onerest classifier with kfold cv
clf = OneVsRestClassifier(DecisionTreeClassifier(random_state=0), n_jobs=-1)
scores = cross_val_score(clf, X_train_tfidf, y_train, cv=10, scoring='accuracy')


# print mean score and 10 fold scores
print("Mean accuracy: ", scores.mean())
print("Accuracy: ", scores)


Mean accuracy:  0.8474811340796627
Accuracy:  [0.84940103 0.8491158  0.85011409 0.85054193 0.84469481 0.85382202
 0.84424476 0.83896734 0.84937955 0.84453002]


In [52]:
# cross validation for logistic regression with onevsrest classifier
from sklearn.model_selection import cross_val_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.multiclass import OneVsRestClassifier

# onerest classifier with kfold cv
clf = OneVsRestClassifier(DecisionTreeClassifier(random_state=0), n_jobs=-1)
scores = cross_val_score(clf, X_train_tfidf, y_train, cv=10, scoring='f1_macro')

#print mean score and 10 fold scores
print("F1: ", scores.mean())


F1:  0.8414131466841196


In [55]:
# cross validation for logistic regression with onevsrest classifier
from sklearn.model_selection import cross_val_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.multiclass import OneVsRestClassifier

# onerest classifier with kfold cv
clf = OneVsRestClassifier(DecisionTreeClassifier(random_state=0), n_jobs=-1)
scores = cross_val_score(clf, X_train_tfidf, y_train, cv=10, scoring='recall_macro')

#print mean score and 10 fold scores
print("Recall: ", scores.mean())


Recall:  0.8389614232362709


In [56]:
# cross validation for logistic regression with onevsrest classifier
from sklearn.model_selection import cross_val_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.multiclass import OneVsRestClassifier

# onerest classifier with kfold cv
clf = OneVsRestClassifier(DecisionTreeClassifier(random_state=0), n_jobs=-1)
scores = cross_val_score(clf, X_train_tfidf, y_train, cv=10, scoring='precision_macro')

#print mean score and 10 fold scores
print("Precision: ", scores.mean())


Precision:  0.8578772278107119


In [57]:
#cross_validate
from sklearn.model_selection import cross_validate
scoring = ['accuracy', 'f1_macro', 'recall_macro', 'precision_macro', 'roc_auc']
scores=cross_validate(clf, X_train_tfidf, y_train)

#print output
print('Accuracy: ', scores['test_accuracy'].mean())
print('F1: ', scores['test_f1_macro'].mean())
print('Recall: ', scores['test_recall_macro'].mean())
print('Precision: ', scores['test_precision_macro'].mean())
print('AUC: ', scores['test_roc_auc'].mean())

KeyError: 'test_accuracy'

In [None]:
#cross validate
from sklearn.model_selection import cross_validate
scoring = ['accuracy', 'f1_macro', 'recall_macro', 'precision_macro', 'roc_auc_ovr']
scores = cross_validate(clf, X_train_tfidf, y_train)

# print mean score and 10 fold scores
print("Mean ROC AUC score: ", scores.mean())

In [None]:
#calcuate f1 score of one rest classifier with kfolds
from sklearn.model_selection import cross_val_score
from sklearn.metrics import f1_score
from sklearn.metrics import make_scorer
f1_scorer = make_scorer(f1_score, average='micro')
scores = cross_val_score(pipeline, X_train, y_train, cv=10, scoring=f1_scorer)
print(scores.mean())


0.8696018960669984


In [None]:
#calcuate f1 score of one rest classifier with kfolds
from sklearn.metrics import f1_score
from sklearn.model_selection import cross_val_predict

# predict on training data
y_train_pred = cross_val_predict(clf, X_train_tfidf, y_train, cv=10)

# calculate f1 score
f1_score(y_train, y_train_pred, average='weighted')

# predict on test data
y_test_pred = cross_val_predict(clf, X_test_tfidf, y_test, cv=10)


# Sentiment

In [None]:
# new train test list for predicting polarity
X_train, X_test, y_train, y_test = train_test_split(df['Comment'], df['polarity'], test_size=0.2, random_state=42)
# build a pipeline for logistic regression with onevsrest classifier and tfidf vectorizer with cross validation
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer

# create a pipeline
pipeline = Pipeline([
    ('tfidf', TfidfVectorizer(max_df=0.8, max_features=5000, ngram_range=(1,2))),
    ('clf', OneVsRestClassifier(DecisionTreeClassifier(random_state=0), n_jobs=-1)),
])

# kfold split
from sklearn.model_selection import KFold
kfold = KFold(n_splits=10, random_state=42, shuffle=True)

# cross validation
results = cross_val_score(pipeline, X_train, y_train, cv=kfold, scoring='accuracy')

# print mean score and 10 fold scores
print("Mean accuracy: ", results.mean())
print("ROC accuracy: ", results.mean)

Mean accuracy:  0.8484796810618601
ROC accuracy:  <built-in method mean of numpy.ndarray object at 0x00000192C9EDA030>


In [61]:
# cross_val_predict for logPolarity with onevsrest classifier 10 fold cv
from sklearn.model_selection import cross_val_predict
y_train_pred = cross_val_predict(pipeline, X_train, y_train, cv=10)

In [None]:
#cross validation score calculate roc_auc_score
from sklearn.model_selection import cross_val_score
roc_auc_score = cross_val_score(pipeline, X_train, y_train, cv=kfold, scoring='roc_auc_ovr')
print (roc_auc_score.mean())

In [None]:
#cross validation score calculate f1 score
from sklearn.model_selection import cross_val_score
f1 = cross_val_score(pipeline, X_train, y_train, cv=kfold, scoring='f1_macro')
print (f1.mean())

#cross validation score calculate precision
from sklearn.model_selection import cross_val_score
precision = cross_val_score(pipeline,X_train, y_train, cv=kfold, scoring='precision_macro')
print (precision.mean())

#cross validation score calculate recall
from sklearn.model_selection import cross_val_score
recall = cross_val_score(pipeline, X_train, y_train, cv=kfold, scoring='recall_macro')
print (recall.mean())

#cross validation score calculate accuracy
from sklearn.model_selection import cross_val_score
accuracy = cross_val_score(pipeline,X_train, y_train, cv=kfold, scoring='accuracy')
print (accuracy.mean())

#cross validation score calculate roc_auc_score
from sklearn.model_selection import cross_val_score
roc_auc_score = cross_val_score(pipeline, X_train, y_train, cv=kfold, scoring='roc_auc_ovr')
print (roc_auc_score.mean())


0.842116929300156
0.857818723848454
0.8398351792260058
0.8484796810618601


  Y /= np.sum(Y, axis=1)[:, np.newaxis]
Traceback (most recent call last):
  File "c:\Users\DELL\AppData\Local\Programs\Python\Python39\lib\site-packages\sklearn\model_selection\_validation.py", line 767, in _score
    scores = scorer(estimator, X_test, y_test)
  File "c:\Users\DELL\AppData\Local\Programs\Python\Python39\lib\site-packages\sklearn\metrics\_scorer.py", line 106, in __call__
    score = scorer._score(cached_call, estimator, *args, **kwargs)
  File "c:\Users\DELL\AppData\Local\Programs\Python\Python39\lib\site-packages\sklearn\metrics\_scorer.py", line 312, in _score
    return self._sign * self._score_func(y, y_pred, **self._kwargs)
  File "c:\Users\DELL\AppData\Local\Programs\Python\Python39\lib\site-packages\sklearn\metrics\_ranking.py", line 550, in roc_auc_score
    y_score = check_array(y_score, ensure_2d=False)
  File "c:\Users\DELL\AppData\Local\Programs\Python\Python39\lib\site-packages\sklearn\utils\validation.py", line 899, in check_array
    _assert_all_finite(

nan


  Y /= np.sum(Y, axis=1)[:, np.newaxis]
Traceback (most recent call last):
  File "c:\Users\DELL\AppData\Local\Programs\Python\Python39\lib\site-packages\sklearn\model_selection\_validation.py", line 767, in _score
    scores = scorer(estimator, X_test, y_test)
  File "c:\Users\DELL\AppData\Local\Programs\Python\Python39\lib\site-packages\sklearn\metrics\_scorer.py", line 106, in __call__
    score = scorer._score(cached_call, estimator, *args, **kwargs)
  File "c:\Users\DELL\AppData\Local\Programs\Python\Python39\lib\site-packages\sklearn\metrics\_scorer.py", line 312, in _score
    return self._sign * self._score_func(y, y_pred, **self._kwargs)
  File "c:\Users\DELL\AppData\Local\Programs\Python\Python39\lib\site-packages\sklearn\metrics\_ranking.py", line 550, in roc_auc_score
    y_score = check_array(y_score, ensure_2d=False)
  File "c:\Users\DELL\AppData\Local\Programs\Python\Python39\lib\site-packages\sklearn\utils\validation.py", line 899, in check_array
    _assert_all_finite(

In [63]:
#cross_validate
from sklearn.model_selection import cross_validate
scoring = ['accuracy', 'f1_macro', 'recall_macro', 'precision_macro', 'roc_auc_ovr']
scores=cross_validate(clf, X_train, y_train, scoring=scoring, cv=10, return_train_score=True)

#print output
print('Accuracy: ', scores['test_accuracy'].mean())
print('F1: ', scores['test_f1_macro'].mean())
print('Recall: ', scores['test_recall_macro'].mean())
print('Precision: ', scores['test_precision_macro'].mean())
print('AUC: ', scores['test_roc_auc'].mean())

ValueError: 
All the 10 fits failed.
It is very likely that your model is misconfigured.
You can try to debug the error by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
1 fits failed with the following error:
joblib.externals.loky.process_executor._RemoteTraceback: 
"""
Traceback (most recent call last):
  File "c:\Users\DELL\AppData\Local\Programs\Python\Python39\lib\site-packages\joblib\externals\loky\process_executor.py", line 428, in _process_worker
    r = call_item()
  File "c:\Users\DELL\AppData\Local\Programs\Python\Python39\lib\site-packages\joblib\externals\loky\process_executor.py", line 275, in __call__
    return self.fn(*self.args, **self.kwargs)
  File "c:\Users\DELL\AppData\Local\Programs\Python\Python39\lib\site-packages\joblib\_parallel_backends.py", line 620, in __call__
    return self.func(*args, **kwargs)
  File "c:\Users\DELL\AppData\Local\Programs\Python\Python39\lib\site-packages\joblib\parallel.py", line 288, in __call__
    return [func(*args, **kwargs)
  File "c:\Users\DELL\AppData\Local\Programs\Python\Python39\lib\site-packages\joblib\parallel.py", line 288, in <listcomp>
    return [func(*args, **kwargs)
  File "c:\Users\DELL\AppData\Local\Programs\Python\Python39\lib\site-packages\sklearn\utils\fixes.py", line 117, in __call__
    return self.function(*args, **kwargs)
  File "c:\Users\DELL\AppData\Local\Programs\Python\Python39\lib\site-packages\sklearn\multiclass.py", line 83, in _fit_binary
    estimator.fit(X, y)
  File "c:\Users\DELL\AppData\Local\Programs\Python\Python39\lib\site-packages\sklearn\tree\_classes.py", line 969, in fit
    super().fit(
  File "c:\Users\DELL\AppData\Local\Programs\Python\Python39\lib\site-packages\sklearn\tree\_classes.py", line 172, in fit
    X, y = self._validate_data(
  File "c:\Users\DELL\AppData\Local\Programs\Python\Python39\lib\site-packages\sklearn\base.py", line 591, in _validate_data
    X = check_array(X, input_name="X", **check_X_params)
  File "c:\Users\DELL\AppData\Local\Programs\Python\Python39\lib\site-packages\sklearn\utils\validation.py", line 856, in check_array
    array = np.asarray(array, order=order, dtype=dtype)
  File "c:\Users\DELL\AppData\Local\Programs\Python\Python39\lib\site-packages\pandas\core\series.py", line 872, in __array__
    return np.asarray(self._values, dtype)
ValueError: could not convert string to float: 'fool not buy new phone every year like rest smart people'
"""

The above exception was the direct cause of the following exception:

Traceback (most recent call last):
  File "c:\Users\DELL\AppData\Local\Programs\Python\Python39\lib\site-packages\sklearn\model_selection\_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\Users\DELL\AppData\Local\Programs\Python\Python39\lib\site-packages\sklearn\multiclass.py", line 327, in fit
    self.estimators_ = Parallel(n_jobs=self.n_jobs, verbose=self.verbose)(
  File "c:\Users\DELL\AppData\Local\Programs\Python\Python39\lib\site-packages\joblib\parallel.py", line 1098, in __call__
    self.retrieve()
  File "c:\Users\DELL\AppData\Local\Programs\Python\Python39\lib\site-packages\joblib\parallel.py", line 975, in retrieve
    self._output.extend(job.get(timeout=self.timeout))
  File "c:\Users\DELL\AppData\Local\Programs\Python\Python39\lib\site-packages\joblib\_parallel_backends.py", line 567, in wrap_future_result
    return future.result(timeout=timeout)
  File "c:\Users\DELL\AppData\Local\Programs\Python\Python39\lib\concurrent\futures\_base.py", line 445, in result
    return self.__get_result()
  File "c:\Users\DELL\AppData\Local\Programs\Python\Python39\lib\concurrent\futures\_base.py", line 390, in __get_result
    raise self._exception
ValueError: could not convert string to float: 'fool not buy new phone every year like rest smart people'

--------------------------------------------------------------------------------
9 fits failed with the following error:
joblib.externals.loky.process_executor._RemoteTraceback: 
"""
Traceback (most recent call last):
  File "c:\Users\DELL\AppData\Local\Programs\Python\Python39\lib\site-packages\joblib\externals\loky\process_executor.py", line 428, in _process_worker
    r = call_item()
  File "c:\Users\DELL\AppData\Local\Programs\Python\Python39\lib\site-packages\joblib\externals\loky\process_executor.py", line 275, in __call__
    return self.fn(*self.args, **self.kwargs)
  File "c:\Users\DELL\AppData\Local\Programs\Python\Python39\lib\site-packages\joblib\_parallel_backends.py", line 620, in __call__
    return self.func(*args, **kwargs)
  File "c:\Users\DELL\AppData\Local\Programs\Python\Python39\lib\site-packages\joblib\parallel.py", line 288, in __call__
    return [func(*args, **kwargs)
  File "c:\Users\DELL\AppData\Local\Programs\Python\Python39\lib\site-packages\joblib\parallel.py", line 288, in <listcomp>
    return [func(*args, **kwargs)
  File "c:\Users\DELL\AppData\Local\Programs\Python\Python39\lib\site-packages\sklearn\utils\fixes.py", line 117, in __call__
    return self.function(*args, **kwargs)
  File "c:\Users\DELL\AppData\Local\Programs\Python\Python39\lib\site-packages\sklearn\multiclass.py", line 83, in _fit_binary
    estimator.fit(X, y)
  File "c:\Users\DELL\AppData\Local\Programs\Python\Python39\lib\site-packages\sklearn\tree\_classes.py", line 969, in fit
    super().fit(
  File "c:\Users\DELL\AppData\Local\Programs\Python\Python39\lib\site-packages\sklearn\tree\_classes.py", line 172, in fit
    X, y = self._validate_data(
  File "c:\Users\DELL\AppData\Local\Programs\Python\Python39\lib\site-packages\sklearn\base.py", line 591, in _validate_data
    X = check_array(X, input_name="X", **check_X_params)
  File "c:\Users\DELL\AppData\Local\Programs\Python\Python39\lib\site-packages\sklearn\utils\validation.py", line 856, in check_array
    array = np.asarray(array, order=order, dtype=dtype)
  File "c:\Users\DELL\AppData\Local\Programs\Python\Python39\lib\site-packages\pandas\core\series.py", line 872, in __array__
    return np.asarray(self._values, dtype)
ValueError: could not convert string to float: 'hear happen state next though'
"""

The above exception was the direct cause of the following exception:

Traceback (most recent call last):
  File "c:\Users\DELL\AppData\Local\Programs\Python\Python39\lib\site-packages\sklearn\model_selection\_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\Users\DELL\AppData\Local\Programs\Python\Python39\lib\site-packages\sklearn\multiclass.py", line 327, in fit
    self.estimators_ = Parallel(n_jobs=self.n_jobs, verbose=self.verbose)(
  File "c:\Users\DELL\AppData\Local\Programs\Python\Python39\lib\site-packages\joblib\parallel.py", line 1098, in __call__
    self.retrieve()
  File "c:\Users\DELL\AppData\Local\Programs\Python\Python39\lib\site-packages\joblib\parallel.py", line 975, in retrieve
    self._output.extend(job.get(timeout=self.timeout))
  File "c:\Users\DELL\AppData\Local\Programs\Python\Python39\lib\site-packages\joblib\_parallel_backends.py", line 567, in wrap_future_result
    return future.result(timeout=timeout)
  File "c:\Users\DELL\AppData\Local\Programs\Python\Python39\lib\concurrent\futures\_base.py", line 445, in result
    return self.__get_result()
  File "c:\Users\DELL\AppData\Local\Programs\Python\Python39\lib\concurrent\futures\_base.py", line 390, in __get_result
    raise self._exception
ValueError: could not convert string to float: 'hear happen state next though'


In [64]:
# cross validation for logistic regression with onevsrest classifier
from sklearn.model_selection import cross_val_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.multiclass import OneVsRestClassifier

# onerest classifier with kfold cv
clf = OneVsRestClassifier(DecisionTreeClassifier(random_state=0), n_jobs=-1)
scores = cross_val_score(clf, X_train_tfidf, y_train, cv=10, scoring='roc_auc')

# print mean score and 10 fold scores
print("Mean ROC AUC score: ", scores.mean())
print("ROC AUC scores: ", scores)


Traceback (most recent call last):
  File "c:\Users\DELL\AppData\Local\Programs\Python\Python39\lib\site-packages\sklearn\model_selection\_validation.py", line 767, in _score
    scores = scorer(estimator, X_test, y_test)
  File "c:\Users\DELL\AppData\Local\Programs\Python\Python39\lib\site-packages\sklearn\metrics\_scorer.py", line 106, in __call__
    score = scorer._score(cached_call, estimator, *args, **kwargs)
  File "c:\Users\DELL\AppData\Local\Programs\Python\Python39\lib\site-packages\sklearn\metrics\_scorer.py", line 352, in _score
    raise ValueError("{0} format is not supported".format(y_type))
ValueError: multiclass format is not supported

Traceback (most recent call last):
  File "c:\Users\DELL\AppData\Local\Programs\Python\Python39\lib\site-packages\sklearn\model_selection\_validation.py", line 767, in _score
    scores = scorer(estimator, X_test, y_test)
  File "c:\Users\DELL\AppData\Local\Programs\Python\Python39\lib\site-packages\sklearn\metrics\_scorer.py", line 106

Mean ROC AUC score:  nan
ROC AUC scores:  [nan nan nan nan nan nan nan nan nan nan]


Traceback (most recent call last):
  File "c:\Users\DELL\AppData\Local\Programs\Python\Python39\lib\site-packages\sklearn\model_selection\_validation.py", line 767, in _score
    scores = scorer(estimator, X_test, y_test)
  File "c:\Users\DELL\AppData\Local\Programs\Python\Python39\lib\site-packages\sklearn\metrics\_scorer.py", line 106, in __call__
    score = scorer._score(cached_call, estimator, *args, **kwargs)
  File "c:\Users\DELL\AppData\Local\Programs\Python\Python39\lib\site-packages\sklearn\metrics\_scorer.py", line 352, in _score
    raise ValueError("{0} format is not supported".format(y_type))
ValueError: multiclass format is not supported



## test acc

In [None]:
#create a pepline for the model to predict polarity
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import cross_val_score

In [None]:
#split data into train and test
X1 = df['Comment']
y1 = df['polarity']
X_train1, X_test1, y_train1, y_test1 = train_test_split(X1, y1, test_size=0.2, random_state=42)

#build a pipeline
pipeline = Pipeline([
    ('tfidf', TfidfVectorizer()),
    ('clf', DecisionTreeClassifier()),
])



In [None]:
#fit the model
pipeline.fit(X_train1, y_train1)

#predict on test data
predictions = pipeline.predict(X_test1)

#calculate accuracy
accuracy = accuracy_score(y_test1,predictions)
print("Accuracy = ", accuracy)

#calculate roc_auc_score
roc_auc = roc_auc_score(y_test1,predictions)
print("ROC_AUC_Score = ", roc_auc)


Accuracy =  0.873873359954364


ValueError: could not convert string to float: 'neutral'

In [None]:
#fit the model
pipeline.fit(X_train1, y_train1)

#predict the model
y_pred1 = pipeline.predict(X_test1)

#check the accuracy
print('accuracy %s' % accuracy_score(y_pred1, y_test1))
print(classification_report(y_test1, y_pred1))\

# roc_auc_score
print (roc_auc_score(y_test1, y_pred1))

accuracy 0.8758699372504278
              precision    recall  f1-score   support

    negative       0.81      0.81      0.81      5184
     neutral       0.96      0.96      0.96      6257
    positive       0.84      0.84      0.84      6089

    accuracy                           0.88     17530
   macro avg       0.87      0.87      0.87     17530
weighted avg       0.88      0.88      0.88     17530



ValueError: could not convert string to float: 'neutral'

In [None]:
#confusion matrix
confusion_matrix(y_test1, y_pred1)

array([[4193,  127,  864],
       [ 118, 6022,  117],
       [ 846,  104, 5139]], dtype=int64)

In [None]:
# k-fold cross validation 

accuracies = cross_val_score(estimator = pipeline, X = X_train1, y = y_train1, cv = 10)
print("Accuracy: {:.2f} %".format(accuracies.mean()*100))
print("Standard Deviation: {:.2f} %".format(accuracies.std()*100))

Accuracy: 86.90 %
Standard Deviation: 0.42 %
