# Scraping Reddit

In [9]:
import requests
import time

import pandas as pd

import numpy as np
import xgboost
import matplotlib.pyplot as plt

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV, train_test_split, cross_val_score
from sklearn.svm import SVC
from sklearn.ensemble import AdaBoostClassifier
from xgboost import XGBClassifier
from sklearn.svm import LinearSVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegressionCV
from sklearn.tree import export_text, DecisionTreeClassifier, plot_tree
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import confusion_matrix, plot_confusion_matrix


## API Calls

In [10]:
url = 'https://api.pushshift.io/reddit/search/submission'

params = {
    'subreddit': 'AskNYC',
    'size': 100,
    'before': 1631370974
}

In [11]:
res = requests.get(url, params=params)

In [12]:
data = res.json()
type(data)

dict

In [13]:
data['data'][-1]['created_utc']

1631280168

### Looping Ask NYC

In [14]:
url = 'https://api.pushshift.io/reddit/search/submission'
subreddit = 'AskNYC'
before = 1632922428

df_list = []

for _ in range(40):
        
    params = {
        'subreddit': subreddit,
        'size': 100,
        'before': before
    }

    res = requests.get(url, params=params)
    data = res.json()
    
    before = data['data'][-1]['created_utc']
    print(f'before updated to: {before}')
    
    post_df = pd.DataFrame(data['data'])
    df_list.append(post_df)
    
    time.sleep(3)

before updated to: 1632801035
before updated to: 1632713881
before updated to: 1632609492
before updated to: 1632513165
before updated to: 1632420108
before updated to: 1632338554
before updated to: 1632251959
before updated to: 1632162799
before updated to: 1632076839
before updated to: 1631935994
before updated to: 1631839850
before updated to: 1631744646
before updated to: 1631657563
before updated to: 1631564141
before updated to: 1631489423
before updated to: 1631386597
before updated to: 1631293600
before updated to: 1631207760
before updated to: 1631119654
before updated to: 1631035738
before updated to: 1630930246
before updated to: 1630799564
before updated to: 1630700273
before updated to: 1630626198
before updated to: 1630553525
before updated to: 1630460117
before updated to: 1630364458
before updated to: 1630281936
before updated to: 1630175896
before updated to: 1630078174
before updated to: 1629985328
before updated to: 1629861660
before updated to: 1629759255
before upd

In [15]:
AskNYC_df = pd.concat(df_list)

### Looping Ask LA

In [16]:
url = 'https://api.pushshift.io/reddit/search/submission'
subreddit = 'AskLosAngeles'
before = 1632922428

df_list = []

for _ in range(40):
        
    params = {
        'subreddit': subreddit,
        'size': 100,
        'before': before
    }

    res = requests.get(url, params=params)
    data = res.json()
    
    before = data['data'][-1]['created_utc']
    print(f'before updated to: {before}')
    
    post_df = pd.DataFrame(data['data'])
    df_list.append(post_df)
    
    time.sleep(3)
    
AskLosAngeles_df = pd.concat(df_list)

before updated to: 1632602022
before updated to: 1632328788
before updated to: 1631960486
before updated to: 1631652328
before updated to: 1631331222
before updated to: 1630983454
before updated to: 1630596608
before updated to: 1630307196
before updated to: 1630015984
before updated to: 1629743166
before updated to: 1629417990
before updated to: 1629005530
before updated to: 1628708334
before updated to: 1628450583
before updated to: 1628106375
before updated to: 1627769552
before updated to: 1627439222
before updated to: 1627100871
before updated to: 1626799672
before updated to: 1626458276
before updated to: 1626153959
before updated to: 1625867273
before updated to: 1625550012
before updated to: 1625261517
before updated to: 1624993155
before updated to: 1624593361
before updated to: 1624335115
before updated to: 1623990404
before updated to: 1623704953
before updated to: 1623379247
before updated to: 1623098727
before updated to: 1622726999
before updated to: 1622424026
before upd

In [17]:
NYC_LA_df = pd.concat([AskNYC_df,AskLosAngeles_df])

In [18]:
NYC_LA_df.columns

Index(['all_awardings', 'allow_live_comments', 'author',
       'author_flair_css_class', 'author_flair_richtext', 'author_flair_text',
       'author_flair_type', 'author_fullname', 'author_is_blocked',
       'author_patreon_flair', 'author_premium', 'awarders', 'can_mod_post',
       'contest_mode', 'created_utc', 'domain', 'full_link', 'gildings', 'id',
       'is_created_from_ads_ui', 'is_crosspostable', 'is_meta',
       'is_original_content', 'is_reddit_media_domain', 'is_robot_indexable',
       'is_self', 'is_video', 'link_flair_background_color',
       'link_flair_richtext', 'link_flair_text_color', 'link_flair_type',
       'locked', 'media_only', 'no_follow', 'num_comments', 'num_crossposts',
       'over_18', 'parent_whitelist_status', 'permalink', 'pinned', 'pwls',
       'retrieved_on', 'score', 'selftext', 'send_replies', 'spoiler',
       'stickied', 'subreddit', 'subreddit_id', 'subreddit_subscribers',
       'subreddit_type', 'thumbnail', 'title', 'total_awards_rece

In [19]:
NYC_LA_df.shape

(7999, 70)

In [20]:
NYC_LA_df.shape

(7999, 70)

### Baseline accuracy

In [21]:
NYC_LA_df['subreddit'].value_counts(normalize=True)

AskNYC           0.500063
AskLosAngeles    0.499937
Name: subreddit, dtype: float64

### Dropna

In [27]:
NYC_LA_df_dropna = NYC_LA_df.dropna()

In [30]:
NYC_LA_df

Unnamed: 0,all_awardings,allow_live_comments,author,author_flair_css_class,author_flair_richtext,author_flair_text,author_flair_type,author_fullname,author_is_blocked,author_patreon_flair,...,post_hint,preview,author_flair_background_color,author_flair_text_color,edited,banned_by,author_cakeday,link_flair_template_id,link_flair_text,author_flair_template_id
0,[],False,ApartmentQuestion5,,[],,text,t2_8q93ft6k,False,False,...,,,,,,,,,,
1,[],False,dammets,,[],,text,t2_p3iwm,False,False,...,,,,,,,,,,
2,[],False,uncertainness,,[],,text,t2_3tlw4,False,False,...,,,,,,,,,,
3,[],False,lentil5oup,,[],,text,t2_fig5g,False,False,...,,,,,,,,,,
4,[],False,discondat,,[],,text,t2_5n02e37h,False,False,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,[],False,poli8999,,[],,text,t2_13lxbx,,False,...,,,,,,,,e19a32b0-f84e-11e9-ad04-0e323326f190,Transportation,
96,[],False,Very_Smart_Guy,,[],,text,t2_cze9289,,False,...,,,,,,,,5bd14530-a9ee-11eb-9c2c-0ec8f61982b3,Living,
97,[],False,livedinfrance,,[],,text,t2_a28w0ffu,,False,...,,,,,,,,f3f22b4a-a9ee-11eb-a2f5-0eb999ecf731,Things to do,
98,[],False,dolphinballoon,,[],,text,t2_5eq9qgdq,,False,...,,,,,,,,2a731092-a9f1-11eb-826d-0e22ed5e97f5,COVID-19,


## Count vectorize and train-test split

In [35]:
NYC_LA_df_copy = NYC_LA_df

In [42]:
test = NYC_LA_df[['selftext','subreddit']].dropna()

In [43]:
X = test['selftext']
y = test['subreddit']

X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, random_state=42)

### Logistic Regression (tfid)

In [44]:
pipe = Pipeline([
    ('tf', TfidfVectorizer(min_df=2)),
    ('lr', LogisticRegressionCV(solver='liblinear'))
])

pipe.fit(X_train, y_train)
pipe.score(X_train, y_train), pipe.score(X_test, y_test)

(0.9368948247078465, 0.8202303455182774)

In [46]:
pred = pipe.predict(X_test)

In [47]:
test = pd.DataFrame(pred)

In [48]:
test[0].value_counts(normalize=True)

AskNYC           0.529294
AskLosAngeles    0.470706
Name: 0, dtype: float64

### Logistic Regression (Count Vectorize)

In [49]:
pipe = Pipeline([
    ('cv', CountVectorizer(min_df=2)),
    ('lr', LogisticRegressionCV(solver='liblinear'))
])

pipe.fit(X_train, y_train)
pipe.score(X_train, y_train), pipe.score(X_test, y_test)

(0.9480801335559266, 0.8157235853780671)

In [50]:
pred = pipe.predict(X_test)

In [51]:
test = pd.DataFrame(pred)

In [52]:
test[0].value_counts(normalize=True)

AskNYC           0.541813
AskLosAngeles    0.458187
Name: 0, dtype: float64

### Test phrases - LogReg

In [53]:
test = ['Happy', 'Nice','Smile','Fun']

In [54]:
test = pd.DataFrame(test)
new_pred =pipe.predict(test[0])
new_pred

array(['AskNYC', 'AskNYC', 'AskNYC', 'AskNYC'], dtype=object)

In [None]:
test = ['I am sad.','Sad','I am unhappy', 'Unhappy','Mad','Angry']
test = pd.DataFrame(test)
new_pred =pipe.predict(test[0])
new_pred

In [None]:
test = ['Get away from me.', 'Your fired','Get out of my way.','You suck']
test = pd.DataFrame(test)
new_pred =pipe.predict(test[0])
new_pred

In [None]:
test = ['Beach.', 'Sunny.','Lakers.','Dodgers.']
test = pd.DataFrame(test)
new_pred =pipe.predict(test[0])
new_pred

In [None]:
test = ['How are you.', 'Great job.','I love you.','I hate you.']
test = pd.DataFrame(test)
new_pred =pipe.predict(test[0])
new_pred

## How well does a decision tree perform?

In [34]:
pipe = Pipeline([
    ('tf', CountVectorizer(min_df=2)),
    ('dtc', DecisionTreeClassifier()
)
])

pipe.fit(X_train, y_train)
pipe.score(X_train, y_train), pipe.score(X_test, y_test)

ValueError: np.nan is an invalid document, expected byte or unicode string.

In [None]:
pred = pipe.predict(X_test)

In [None]:
test = pd.DataFrame(pred)

In [None]:
test[0].value_counts(normalize=True)

### Test phrases - Decision Tree

In [None]:
test = ['Happy', 'Nice','Smile','Fun']

In [None]:
test = pd.DataFrame(test)
new_pred =pipe.predict(test[0])
new_pred

In [None]:
test = ['Sad', 'Unhappy','Mad','Angry']
test = pd.DataFrame(test)
new_pred =pipe.predict(test[0])
new_pred

In [None]:
test = ['Get away from me.', 'Your fired','Get out of my way.','You suck']
test = pd.DataFrame(test)
new_pred =pipe.predict(test[0])
new_pred

In [None]:
test = ['How are you.', 'Great job.','I love you.','I hate you.']
test = pd.DataFrame(test)
new_pred =pipe.predict(test[0])
new_pred

### Multinomial

In [None]:
pipe = Pipeline([
    ('cv', CountVectorizer(min_df=2)),
    ('mnb', MultinomialNB())
])

pipe.fit(X_train, y_train)
pipe.score(X_train, y_train), pipe.score(X_test, y_test)

###  Extra Tree

In [None]:
pipe = Pipeline([
    ('cv', CountVectorizer(min_df=2)),
    ('mnb', ExtraTreesClassifier())
])

pipe.fit(X_train, y_train)
pipe.score(X_train, y_train), pipe.score(X_test, y_test)

### AdaBoost

In [None]:
pipe = Pipeline([
    ('cv', CountVectorizer(min_df=2)),
    ('mnb', AdaBoostClassifier())
])

pipe.fit(X_train, y_train)
pipe.score(X_train, y_train), pipe.score(X_test, y_test)

### KNN

In [None]:
pipe = Pipeline([
    ('cv', CountVectorizer(min_df=2)),
    ('mnb', KNeighborsClassifier())
])

pipe.fit(X_train, y_train)
pipe.score(X_train, y_train), pipe.score(X_test, y_test)

### SVC

In [None]:
pipe = Pipeline([
    ('cv', CountVectorizer(min_df=2)),
    ('svc', SVC())
])

pipe.fit(X_train, y_train)
pipe.score(X_train, y_train), pipe.score(X_test, y_test)

### Random Forest

In [None]:
pipe = Pipeline([
    ('cv', CountVectorizer(min_df=2)),
    ('mnb', RandomForestClassifier())
])

pipe.fit(X_train, y_train)
pipe.score(X_train, y_train), pipe.score(X_test, y_test)

In [None]:
pred = pipe.predict(X_test)

In [None]:
test = pd.DataFrame(pred)

In [None]:
test[0].value_counts(normalize=True)

In [None]:
# visualize this

plot_confusion_matrix(pipe, X_test, y_test, cmap='Blues', values_format='d');

In [None]:
from sklearn.metrics import classification_report

In [None]:
print(classification_report(y_test, pred))

### Model XG Boost

In [None]:
xgb = XGBClassifier()

In [None]:
pipe = Pipeline([
    ('cv', TfidfVectorizer(min_df=2)),
    ('xgb', XGBClassifier())
])

pipe.fit(X_train, y_train)
pipe.score(X_train, y_train), pipe.score(X_test, y_test)

In [None]:
pred = pipe.predict(X_test)

In [None]:
test = pd.DataFrame(pred)

In [None]:
test[0].value_counts(normalize=True)

In [None]:
df = test

### XGB gridsearch

In [None]:
# xgb_params = {
#     'n_estimators': range(0, 100, 500),
#     'max_depth': range(3, 6, 10),
#     'gamma':[0, 0.2, 0.4],
#     'reg_alpha':[1e-5, 1e-2, 0.1, 1, 100],
#     'learning_rate':[1e-2, 0.1, 0.2]
# }

In [None]:
xgb_params = {
    #'n_estimators':[0, 100, 500],
    #'max_depth':[3, 6, 10],
    #'gamma':[0, 0.2, 0.4],
    'reg_alpha':[1e-5, 1e-2, 0.1, 1, 100],
    'learning_rate':[1e-2, 0.1, 0.2]
}

In [None]:
# Instantiate our GridSearchCV object.
new_xgb = XGBClassifier()
xgb_gridsearch = GridSearchCV(estimator=new_xgb, # What is the model we want to fit?
                              #objective='binary:logistic',
                              param_grid=xgb_params, # What is the dictionary of hyperparameters?
                              n_jobs=4,
                              cv=5) # What number of folds in CV will we use?
                              #verbose=1) # Some output

#source: https://www.analyticsvidhya.com/blog/2016/03/complete-guide-parameter-tuning-xgboost-with-codes-python/

In [None]:
xgb_gridsearch

### CountVectorize

In [None]:
NYC_LA_df['selftext'].isnull().sum()

In [None]:
# test_dropna = NYC_LA_df.dropna(subset=['selftext'])

In [None]:
# test_dropna['selftext'].isnull().sum()

In [None]:
cv = CountVectorizer(min_df=2)
cv.fit(X_train)

X_train_cv = cv.transform(X_train)
X_test_cv = cv.transform(X_test)

In [None]:
# Fit the GridSearchCV object to the data
xgb_gridsearch.fit(X_train_cv, y_train);

In [None]:
# Displays hyperparameters searched over.
xgb_gridsearch.param_grid

In [24]:
# Displays hyperparameters searched over.
xgb_gridsearch.param_grid

NameError: name 'xgb_gridsearch' is not defined

In [None]:
# Print out the score.
# from documentation: Mean cross-validated score of the best_estimator
# https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.GridSearchCV.html
xgb_gridsearch.best_score_

In [None]:
# Print out the score.
# from documentation: Mean cross-validated score of the best_estimator
# https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.GridSearchCV.html
xgb_gridsearch.best_score_

In [None]:
# Print out the set of hyperparameters that achieved the best score.
xgb_gridsearch.best_params_

In [None]:
# Print out the set of hyperparameters that achieved the best score.
xgb_gridsearch.best_params_

In [None]:
# Print out the best model/estimator
xgb_gridsearch.best_estimator_

In [None]:
# Evaluate the best fit model on the test data.
xgb_gridsearch.score(X_test_cv, y_test)

In [None]:
# Evaluate the best fit model on the test data.
xgb_gridsearch.score(X_test_cv, y_test)

### Top XGB parameters (had to process separately)

In [None]:
pipe = Pipeline([
    ('cv', TfidfVectorizer(min_df=2)),
    ('xgb', XGBClassifier(gamma=0.4,
                          max_depth=10,
                          n_estimators=100, 
                          learning_rate=0.2,
                          reg_alpha=1,
                          n_jobs=4,
                          cv=5))
])

pipe.fit(X_train, y_train)
pipe.score(X_train, y_train), pipe.score(X_test, y_test)

In [None]:
pred = pipe.predict(X_test)

In [None]:
test = pd.DataFrame(pred)

In [None]:
test[0].value_counts(normalize=True)

In [None]:
from sklearn.metrics import confusion_matrix, plot_confusion_matrix

In [None]:
# Get predictions
preds = pipe.predict(X_test)

# Save confusion matrix values
tn, fp, fn, tp = confusion_matrix(y_test, preds).ravel()

In [None]:
# Calculate the specificity

spec = tn / (tn + fp)

print('Specificity:', spec)

In [None]:
# visualize this

plot_confusion_matrix(pipe, X_test, y_test, cmap='Blues', values_format='d');

In [None]:
# Calculate the specificity

spec = tn / (tn + fp)

print('Specificity:', spec)

In [None]:
# Calculate the recall

recall = tn / (tn + fn)

print('Recall:', recall)

In [None]:
# Calculate the precision

prec = tp / (tp + fp)

print('Precision:', prec)

In [None]:
# Calculate the F1

F1 = 2*((prec*recall)/(prec+recall))

print('F1:', F1)

In [None]:
#1000 datapoints
test = ['NYPD','FDNY','subway','train','car','LAPD','LA', 'NYC','Los Angeles','New York']
test = pd.DataFrame(test)
new_pred =pipe.predict(test[0])
new_pred

In [None]:
#2000 datapoints
test = ['NYPD','FDNY','subway','train','car','LAPD','LA', 'NYC','Los Angeles','New York']
test = pd.DataFrame(test)
new_pred =pipe.predict(test[0])
new_pred

In [None]:
#4000 datapoints
test = ['NYPD','FDNY','subway','train','car','LAPD','LA', 'NYC','Los Angeles','New York']
test = pd.DataFrame(test)
new_pred =pipe.predict(test[0])
new_pred

In [None]:
#1000 datapoints
test = ['Happy', 'Nice','Smile','Fun']

In [None]:
test = pd.DataFrame(test)
new_pred =pipe.predict(test[0])
new_pred

In [None]:
#2000 datapoints
test = ['Happy', 'Nice','Smile','Fun']

In [None]:
test = pd.DataFrame(test)
new_pred =pipe.predict(test[0])
new_pred

In [None]:
#4000 datapoints
test = ['Happy', 'Nice','Smile','Fun']

In [None]:
test = pd.DataFrame(test)
new_pred =pipe.predict(test[0])
new_pred

In [None]:
#1000 datapoints
test = ['Sad', 'Unhappy','Mad','Angry']
test = pd.DataFrame(test)
new_pred =pipe.predict(test[0])
new_pred

In [None]:
#2000 datapoints
test = ['Sad', 'Unhappy','Mad','Angry']
test = pd.DataFrame(test)
new_pred =pipe.predict(test[0])
new_pred

In [None]:
#1000 datapoints
test = ['Get away from me.', 'Your fired','Get out of my way.','You suck']
test = pd.DataFrame(test)
new_pred =pipe.predict(test[0])
new_pred

In [None]:
#4000 datapoints
test = ['Get away from me.', 'Your fired','Get out of my way.','You suck']
test = pd.DataFrame(test)
new_pred =pipe.predict(test[0])
new_pred

In [None]:
#2000 datapoints
test = ['Get away from me.', 'Your fired','Get out of my way.','You suck']
test = pd.DataFrame(test)
new_pred =pipe.predict(test[0])
new_pred

In [None]:
#1000 datapoints
test = ['How are you.', 'Great job.','I love you.','I hate you.']
test = pd.DataFrame(test)
new_pred =pipe.predict(test[0])
new_pred

In [None]:
#2000 datapoints
test = ['How are you.', 'Great job.','I love you.','I hate you.']
test = pd.DataFrame(test)
new_pred =pipe.predict(test[0])
new_pred

In [None]:
#4000 datapoints
test = ['How are you.', 'Great job.','I love you.','I hate you.']
test = pd.DataFrame(test)
new_pred =pipe.predict(test[0])
new_pred

In [None]:
#1000 datapoints
test = ['Happy', 'Nice','Smile','Fun']

In [None]:
test = pd.DataFrame(test)
new_pred =pipe.predict(test[0])
new_pred

In [None]:
#2000 datapoints
test = ['Happy', 'Nice','Smile','Fun']

In [None]:
test = pd.DataFrame(test)
new_pred =pipe.predict(test[0])
new_pred

In [None]:
#4000 datapoints
test = ['Happy', 'Nice','Smile','Fun']

In [None]:
test = pd.DataFrame(test)
new_pred =pipe.predict(test[0])
new_pred

In [None]:
#1000 datapoints
test = ['Sad', 'Unhappy','Mad','Angry']
test = pd.DataFrame(test)
new_pred =pipe.predict(test[0])
new_pred

In [None]:
#2000 datapoints
test = ['Sad', 'Unhappy','Mad','Angry']
test = pd.DataFrame(test)
new_pred =pipe.predict(test[0])
new_pred

In [None]:
#4000 datapoints
test = ['Sad', 'Unhappy','Mad','Angry']
test = pd.DataFrame(test)
new_pred =pipe.predict(test[0])
new_pred

In [None]:
#4000 datapoints
test = ['How are you.', 'Great job.','I love you.','I hate you.']
test = pd.DataFrame(test)
new_pred =pipe.predict(test[0])
new_pred

In [None]:
#1000 datapoints
test = ['Get away from me.', 'Your fired','Get out of my way.','You suck']
test = pd.DataFrame(test)
new_pred =pipe.predict(test[0])
new_pred

In [None]:
#1000 datapoints
test = ['How are you.', 'Great job.','I love you.','I hate you.']
test = pd.DataFrame(test)
new_pred =pipe.predict(test[0])
new_pred

In [None]:
#2000 datapoints
test = ['How are you.', 'Great job.','I love you.','I hate you.']
test = pd.DataFrame(test)
new_pred =pipe.predict(test[0])
new_pred

### Make a dataframe of accuracies

In [None]:
df=pd.DataFrame({"Accuracies":[79,77.2,76.6,75,73.6,72.8,72.4,68,63 ],
                "w/ stop_words":[76.7,'',73.3,74,'','','','','' ]
                })

In [None]:
df.index = ['XGBoost','LogReg','DecTree','AdaBoost','ExtraTree','SVC','RandFor','MultiNB','KNN']
df

### Make a dataframe of accuracies

In [None]:
df=pd.DataFrame({"Accuracies":[79,77.2,76.6,75,73.6,72.8,72.4,68,63 ],
                "w/ stop_words":[76.7,'',73.3,74,'','','','','' ]
                })

In [None]:
df.index = ['XGBoost','LogReg','DecTree','AdaBoost','ExtraTree','SVC','RandFor','MultiNB','KNN']
df

### Make a dataframe of phrase predictions

In [None]:
df=pd.DataFrame({"Predictio`n":['AskNYC','AskNYC','AskNYC','AskNYC','AskNYC','AskNYC','AskLA','AskNYC','AskLA','AskNYC' ]
                })

In [None]:
df.index = ['NYPD','FDNY','Subway','train','car','LAPD','LA','NYC','Los Angeles','New York']
df

In [None]:
df=pd.DataFrame({"Prediction":['AskNYC','AskNYC','AskNYC','AskNYC','AskNYC','AskNYC','AskNYC','AskNYC']
                })

In [None]:
df.index = ['Happy','Nice','Smile','Fun','Sad','Unhappy','Mad','Angry']
df

In [None]:
df=pd.DataFrame({"Prediction":['AskNYC','AskNYC','AskLA','AskNYC','AskLA','AskLA']
                })

In [None]:
df.index = ['Get away from me','Get out of my way','How are you','Great job','I love you','I hate you']
df

### Make a dataframe of precision, recall and F1

In [None]:
df=pd.DataFrame({"XGBoost":[79.2,78.0,79.7,78.6]
                })

In [None]:
df.index = ['Precision','Recall','Specificity','F1']
df