# feature Selection

In [2]:
import pandas as pd 
import numpy  as np

In [798]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [None]:
clnd_data=pd.read_csv("cleaned_tweets_2")

In [803]:
tfidf_vctrr= TfidfVectorizer(max_df=0.90, min_df=2, max_features=1000, stop_words='english')
tfidf_features = tfidf_vctrr.fit_transform(clnd_data['text']).toarray()

In [806]:
X=tfidf_features
Y=clnd_data["airline_sentiment"]

In [807]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(tfidf_features, Y, test_size=0.2, random_state=0)

In [831]:
X_train

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

# Building the Model 

In [813]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score


In [811]:
rand_forest = RandomForestClassifier(n_estimators=200, random_state=0)
rand_forest.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=None, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=200,
                       n_jobs=None, oob_score=False, random_state=0, verbose=0,
                       warm_start=False)

In [812]:
y_pred = rand_forest.predict(X_test)


In [814]:

print(confusion_matrix(y_test,y_pred))
print(classification_report(y_test,y_pred))
print(accuracy_score(y_test, y_pred))

[[1581  175   64]
 [ 280  265   60]
 [ 120   64  262]]
              precision    recall  f1-score   support

    negative       0.80      0.87      0.83      1820
     neutral       0.53      0.44      0.48       605
    positive       0.68      0.59      0.63       446

    accuracy                           0.73      2871
   macro avg       0.67      0.63      0.65      2871
weighted avg       0.72      0.73      0.73      2871

0.7342389411354928


In [815]:
from sklearn.model_selection import RandomizedSearchCV


In [818]:
# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 200, stop = 2000, num = 10)]
# Number of features to consider at every split
max_features = ['auto', 'sqrt']
# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(10, 110, num = 11)]
max_depth.append(None)
# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]
# Method of selecting samples for training each tree
bootstrap = [True, False]
# Create the random grid

random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}
print(random_grid)

{'n_estimators': [200, 400, 600, 800, 1000, 1200, 1400, 1600, 1800, 2000], 'max_features': ['auto', 'sqrt'], 'max_depth': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, None], 'min_samples_split': [2, 5, 10], 'min_samples_leaf': [1, 2, 4], 'bootstrap': [True, False]}


In [820]:
# Use the random grid to search for best hyperparameters
# First create the base model to tune
rf = RandomForestClassifier()
# Random search of parameters, using 3 fold cross validation, 
# search across 100 different combinations, and use all available cores
rf_random = RandomizedSearchCV(estimator = rf, param_distributions = random_grid, n_iter = 100, cv = 3, verbose=2, random_state=42, n_jobs = -1)
# Fit the random search model
rf_random.fit(X_train, y_train)

Fitting 3 folds for each of 100 candidates, totalling 300 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed: 47.0min
[Parallel(n_jobs=-1)]: Done 154 tasks      | elapsed: 325.9min
[Parallel(n_jobs=-1)]: Done 300 out of 300 | elapsed: 647.9min finished


RandomizedSearchCV(cv=3, error_score='raise-deprecating',
                   estimator=RandomForestClassifier(bootstrap=True,
                                                    class_weight=None,
                                                    criterion='gini',
                                                    max_depth=None,
                                                    max_features='auto',
                                                    max_leaf_nodes=None,
                                                    min_impurity_decrease=0.0,
                                                    min_impurity_split=None,
                                                    min_samples_leaf=1,
                                                    min_samples_split=2,
                                                    min_weight_fraction_leaf=0.0,
                                                    n_estimators='warn',
                                                    n_jobs=None

In [821]:
rf_random.best_params_


{'n_estimators': 800,
 'min_samples_split': 5,
 'min_samples_leaf': 2,
 'max_features': 'auto',
 'max_depth': None,
 'bootstrap': True}

In [822]:
best_random = rf_random.best_estimator_


In [827]:
y_pred_opt = best_random.predict(X_test)


In [828]:

print(confusion_matrix(y_test,y_pred_opt))
print(classification_report(y_test,y_pred_opt))
print(accuracy_score(y_test, y_pred_opt)*100)

[[1657  103   60]
 [ 339  210   56]
 [ 155   36  255]]
              precision    recall  f1-score   support

    negative       0.77      0.91      0.83      1820
     neutral       0.60      0.35      0.44       605
    positive       0.69      0.57      0.62       446

    accuracy                           0.74      2871
   macro avg       0.69      0.61      0.63      2871
weighted avg       0.72      0.74      0.72      2871

73.91152908394287


In [None]:
#before hyperparamter tuning the accuracy was 73.4 
#after hyperparamter tuning the accuracy increased .5% (73.9)


In [862]:
y_pred_opt

array(['negative', 'positive', 'negative', ..., 'negative', 'negative',
       'negative'], dtype=object)

In [None]:
best_random.predict()

In [832]:
Y=clnd_data["airline_sentiment"]

In [834]:
Y

0        positive
1         neutral
2        negative
3        negative
4        negative
5        positive
6         neutral
7        positive
8         neutral
9        positive
10       positive
11       positive
12       positive
13       negative
14       positive
15       negative
16       positive
17       positive
18       negative
19       positive
20       positive
21        neutral
22       negative
23       negative
24       negative
25        neutral
26       negative
27        neutral
28       negative
29        neutral
           ...   
14323    negative
14324     neutral
14325    negative
14326    negative
14327    negative
14328    negative
14329    negative
14330    positive
14331    negative
14332    positive
14333    negative
14334    negative
14335    negative
14336    positive
14337    negative
14338    positive
14339    negative
14340    negative
14341    positive
14342    negative
14343    positive
14344    negative
14345     neutral
14346    negative
14347    n

In [838]:
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder

In [847]:
def encoding_labels(data_,col_name,file_name):
    """this function takes the DataFrame, column name and the file name
    then encodes the catigorical variable using LabelEncoder & OneHotEncoder 
    and save the new data frame to a csv file 
    then returns the encoder instance, the Dataframe with the encoded variables with two additional columns Male-Female 
    and another dataframe with encoded gender column """
    #airline_sentiment
    data_lbl=data_.copy()
    data_pure=data_.copy()

    sent_encoding=LabelEncoder()
    sent_labels=sent_encoding.fit_transform(data_pure[col_name])
    
    encoding_objects={}
    

    encoding_objects[col_name]=LabelEncoder()
    data_pure[col_name]=encoding_objects[col_name].fit_transform(data_pure[col_name])
            
    #print(encoding_objects)
    
    sent_lbl_enc=LabelEncoder()
    sent_1_hot_enc=OneHotEncoder()
    
    data_lbl[col_name]=sent_lbl_enc.fit_transform(data_lbl[col_name])
    sent_values=sent_1_hot_enc.fit_transform(data_lbl[col_name].values.reshape(-1,1)).toarray()
    
    #print(gender_values)
    #print(data_lbl["Gender"])
    
    cols_sent=sent_lbl_enc.inverse_transform(data_lbl[col_name].unique())
    for i in range(len(cols_sent)):
        data_lbl[cols_sent[i]]=sent_values[:,i]
    #print(cols_gender)
    
    data_lbl.to_csv(file_name,index=False)
    data_pure.to_csv("encoded_labeled_df.csv",index=False)
    return sent_lbl_enc, data_lbl, data_pure

In [848]:
file_name="Sent_encoded"

In [849]:
encoding_labels(clnd_data,"airline_sentiment",file_name)

In case you used a LabelEncoder before this OneHotEncoder to convert the categories to integers, then you can now use the OneHotEncoder directly.


(LabelEncoder(),
        airline_sentiment                                               text  \
 0                      2      'added', 'commercials', 'experience', 'tacky'   
 1                      1                                 'today', 'another'   
 2                      0  'really', 'aggressive', 'blast', 'obnoxious', ...   
 3                      0                                  'really', 'thing'   
 4                      0  'seriously', 'would', 'flight', 'seats', 'play...   
 5                      2                                  'nearly', 'every'   
 6                      1  'really', 'missed', 'prime', 'opportunity', 'w...   
 7                      2                      'amazing', 'arrived', 'early'   
 8                      1  'suicide', 'second', 'leading', 'cause', 'deat...   
 9                      2  'pretty', 'graphics', 'better', 'minimal', 'ic...   
 10                     2                     'great', 'already', 'thinking'   
 11                    

In [851]:
new_clnd=pd.read_csv("encoded_labeled_df.csv")

In [852]:
tfidf_vctrr_2= TfidfVectorizer(max_df=0.90, min_df=2, max_features=1000, stop_words='english')
tfidf_features_2 = tfidf_vctrr_2.fit_transform(new_clnd['text']).toarray()

In [853]:
X=tfidf_features_2
Y=new_clnd["airline_sentiment"]

In [858]:
from sklearn.model_selection import train_test_split

X_train2, X_test2, y_train2, y_test2 = train_test_split(tfidf_features_2, Y, test_size=0.2, random_state=0)

In [859]:
rand_forest_2 = RandomForestClassifier(n_estimators=200, random_state=0)
rand_forest_2.fit(X_train2, y_train2)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=None, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=200,
                       n_jobs=None, oob_score=False, random_state=0, verbose=0,
                       warm_start=False)

In [860]:
y_pred2 = rand_forest_2.predict(X_test2)


In [861]:

print(confusion_matrix(y_test2,y_pred2))
print(classification_report(y_test2,y_pred2))
print(accuracy_score(y_test2, y_pred2))

[[1581  175   64]
 [ 280  265   60]
 [ 120   64  262]]
              precision    recall  f1-score   support

           0       0.80      0.87      0.83      1820
           1       0.53      0.44      0.48       605
           2       0.68      0.59      0.63       446

    accuracy                           0.73      2871
   macro avg       0.67      0.63      0.65      2871
weighted avg       0.72      0.73      0.73      2871

0.7342389411354928
