In [None]:
# for model 3: author, 

# Problem Statement:

### Can the text of a subreddit post's `title` and `selftext` reliably predict if a post is 'good advice' or 'bad advice'?

    
    
### Predictors and Target Variable:

**Model 1.2:**
- The predictor variables are `title`, and `selftext`.
- The target variable is `subreddit`.

### Pipeline & GridSearch:
- The predictor variables will be passed into the CountVectorizer transformer.

In [493]:
import pandas as pd
import numpy as np
import requests
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
import regex as re
from bs4 import BeautifulSoup
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.naive_bayes import MultinomialNB

# Create JSON Files via API:

In [2]:
def generate_json_posts(subreddit_str, size):
    
    # Setup URL of API
    base_url = "https://api.pushshift.io/reddit/search/submission"    
    
    # Create the params of the API URL
    params = {
        "subreddit": subreddit_str,
        "size": size
    }

    # Response
    res = requests.get(base_url, params)
    res_check = res.status_code
    
    # Check response is good
    if (res_check >= 200 and res_check < 300):
        
        # Create JSON:
        data = res.json()
        posts = data["data"]
        
        return posts
    else:
        return f"Check HTTP Error: {res_check}"

In [3]:
lpt_posts = generate_json_posts("LifeProTips", 500)
ulpt_posts = generate_json_posts("UnethicalLifeProTips", 500)

### Save JSON Files:

In [4]:
# pd.to_pickle(lpt_posts, "../datasets/lpt_posts_json")
# pd.to_pickle(ulpt_posts, "../datasets/ulpt_posts_json")

### Read JSON Files:

In [435]:
lpt_posts = pd.read_pickle("../datasets/lpt_posts_json")
ulpt_posts = pd.read_pickle("../datasets/ulpt_posts_json")

# Create Dataframe

### Model Features Set:

- `title`
    - The title of the post
- `selftext`
    - Included in the post, this is the 'content' of the post and appears under the title.
    - Not every post in LPT has `selftext` - Many appear with only a title


In [436]:
lpt_df = pd.DataFrame(lpt_posts)
ulpt_df = pd.DataFrame(ulpt_posts)

In [437]:
df = lpt_df.append(ulpt_df, ignore_index=True)

In [438]:
df.shape

(1000, 70)

In [439]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 70 columns):
 #   Column                         Non-Null Count  Dtype  
---  ------                         --------------  -----  
 0   all_awardings                  1000 non-null   object 
 1   allow_live_comments            1000 non-null   bool   
 2   author                         1000 non-null   object 
 3   author_flair_css_class         0 non-null      object 
 4   author_flair_richtext          876 non-null    object 
 5   author_flair_text              0 non-null      object 
 6   author_flair_type              876 non-null    object 
 7   author_fullname                876 non-null    object 
 8   author_patreon_flair           876 non-null    object 
 9   author_premium                 876 non-null    object 
 10  awarders                       1000 non-null   object 
 11  can_mod_post                   1000 non-null   bool   
 12  contest_mode                   1000 non-null   bo

In [440]:
df["selftext"].head()

0    [removed]
1    [removed]
2    [deleted]
3             
4    [removed]
Name: selftext, dtype: object

## Exploration:

- Investigate additional features: `score`, `author`, `over_18`

    - `author` 
         - Set cvec hyparam 'lowercase' == False
         - Has a number of `[deleted]` author names

- Notes for next model:
    - `created_utc` & `retrieved_on` as features

In [441]:
# Column names
df.columns

Index(['all_awardings', 'allow_live_comments', 'author',
       'author_flair_css_class', 'author_flair_richtext', 'author_flair_text',
       'author_flair_type', 'author_fullname', 'author_patreon_flair',
       'author_premium', 'awarders', 'can_mod_post', 'contest_mode',
       'created_utc', 'domain', 'full_link', 'gildings', 'id',
       'is_crosspostable', 'is_meta', 'is_original_content',
       'is_reddit_media_domain', 'is_robot_indexable', 'is_self', 'is_video',
       'link_flair_background_color', 'link_flair_css_class',
       'link_flair_richtext', 'link_flair_template_id', 'link_flair_text',
       'link_flair_text_color', 'link_flair_type', 'locked', 'media_only',
       'no_follow', 'num_comments', 'num_crossposts', 'over_18',
       'parent_whitelist_status', 'permalink', 'pinned', 'pwls',
       'removed_by_category', 'retrieved_on', 'score', 'selftext',
       'send_replies', 'spoiler', 'stickied', 'subreddit', 'subreddit_id',
       'subreddit_subscribers', 'subre

# Feature Engineering & Preprocessing

### Cleaning:

- **HTML Artifacts:**
- **Non-Letters**
- **Stopwords**
- **Lemmatize words**

**Lemmatize:**

- This can help with some typos in our word analysis.
    - For example, we can use lemmatization to identify `untill`, and make a necessary adjustment to model input
- Lemmatization will not be applied to `author`, as these are the usernames attached to the post submission to the subreddit.

In [442]:
def to_lemma(data, col):
    lemma = WordNetLemmatizer()
    tokenizer = RegexpTokenizer(r'\w+')
    for i in range(len(data[col])):
        data[col][i] = tokenizer.tokenize(data[col][i])
        for j in data[col][i]:
            data[col][i] = [lemma.lemmatize(j) for j in data[col][i]]
    data[col] = data[col].apply(lambda i: " ".join(i))
    return data

### Define Stopwords
- In this iteration of the model, the `LPT` or `lpt` word will be removed from the `title` and `selftext` as a stopword.

In [443]:
stopset = set(nltk.corpus.stopwords.words("english"))
stopset.add("lpt")
stopset.add("lptrequest")

stopset.add("ulpt")
stopset.add("ulptrequest")

## Function to perform each Preprocessing task:

- The idea behind this function is to have a function to call on a given feeature that should have all of the preprocessing tasks performed, as listed above.
    - Otherwise, each of the above functions can be called on a feature as needed.

In [444]:
def clean_df(data, col):

    # Remove non-letters:
    new_lst = []
    for i in data[col]:
        soup = BeautifulSoup(i, "lxml")
        new_lst.append(re.sub("[^a-zA-Z]", " ", soup.get_text()))
    data[col] = new_lst
    # Reference to: https://www.reddit.com/r/learnpython/comments/an62wx/how_to_remove_html_from_pandas_dataframe_without/
    
    
    # Make lowercase:
    data[col] = data[col].str.lower()
    

    # Lemmatize:
    for i in range(len(data[col])):
        data[col][i] = tokenizer.tokenize(data[col][i])
        for j in data[col][i]:
            data[col][i] = [lemma.lemmatize(j) for j in data[col][i]]
    data[col] = data[col].apply(lambda i: " ".join(i))
    

    # Remove Stopwords:
    data[col] = [" ".join([i for i in x.split()
                           if i not in stopset])
                           for x in data[col]]
    
    return data

In [445]:
clean_df(df, "title");

  ' that document to Beautiful Soup.' % decoded_markup
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


In [446]:
df.shape

(1000, 70)

### Binarize target `y` variable

In [447]:
df["subreddit"].value_counts()

LifeProTips             500
UnethicalLifeProTips    500
Name: subreddit, dtype: int64

In [448]:
# Create numeric values for y var to be passed into model

df["subreddit"] = df["subreddit"].map({"LifeProTips": 1,
                                       "UnethicalLifeProTips": 0
                                      })

In [449]:
df["subreddit"].value_counts()

1    500
0    500
Name: subreddit, dtype: int64

### Column for if post contains `selftext` called  `has_selftext`

In [450]:
df["selftext"].isna().sum()

df["has_selftext"] = df["selftext"].notnull().astype(int)

print((df["has_selftext"] == 1).sum())

print((df["has_selftext"] == 0).sum())

909
91


### Clean the selftext:

- Not every subreddit post has `selftext`, which is the body of a post.
    - It seems clear that most of the text context of the subreddits are contained in the `title` field, making it an important `predictor` variable.
- Issues with `selftext`:
    - `[removed]`
        - These will be left in for this model.
    - `[deleted]`
        - These will be left in for this model.
    - Contains emojis
    - NaN
        - Every collection of 500 posts seems to have just under 10% of posts have missing values for the `selftext` field

In [451]:
print((df["selftext"] == "[removed]").sum())
removed_selftext = (df["selftext"] == "[removed]").sum()
print(f"Percent [removed]: {(removed_selftext / len(df)) * 100}%")

print((df["selftext"] == "[deleted]").sum())
deleted_selftext = (df["selftext"] == "[deleted]").sum()
print(f"Percent [deleted]: {(deleted_selftext / len(df)) * 100}%")

print((df["selftext"].isna().sum()))
nan_selftext = df["selftext"].isna().sum()
print(f"Percent NaN: {(nan_selftext / len(df)) * 100}%")

total_selftext_probs = removed_selftext + deleted_selftext + nan_selftext
print(total_selftext_probs)
print(f"Percent Total Probs: {(total_selftext_probs / len(df)) * 100}%")

512
Percent [removed]: 51.2%
33
Percent [deleted]: 3.3000000000000003%
91
Percent NaN: 9.1%
636
Percent Total Probs: 63.6%


In [452]:
df.dropna(subset=["selftext"], axis=0, inplace=True)

In [453]:
df["selftext"].isna().sum()

0

In [454]:
df["selftext"] = df["selftext"].str.replace("[^a-zA-Z]", " ")

In [455]:
df["selftext"] = df["selftext"].str.lower()

In [456]:
df["selftext"].unique()

array([' removed ', ' deleted ', '',
       ' annualcreditreport com is now offering free weekly credit report from all three bureaus until april        usually  and by law  this is only offered once per year   from annualcreditreport com     during these times   accessing your credit is important  that s why equifax  experian  and transunion are now offering free weekly online reports through april           with id theft rampant nowadays  it s always a good idea to check your credit report on a regular basis ',
       'i want to clean my home and keep it clean  i want to workout daily  get off my phone and work a job all the time  i want to move up in life but a lot of times i lack the motivation   what has helped you ',
       'most real estate contracts require items permanently attached to a wall to remain  and in some cases that includes doorbells   well  if you have a fancy video doorbell  it must stay unless you plan ahead ',
       'it also works if you want to use an image fr

### Save Dataframe:

In [457]:
pd.to_pickle(df, "../datasets/df_model_1.2")

# First Model

In [475]:
features = df["title"]

In [476]:
X = features
y = df["subreddit"]

### Train/Test Split

In [477]:
X_train, X_test, y_train, y_test = train_test_split(X, 
                                                    y,
                                                    test_size=0.33,
                                                    random_state=42)

### Baseline Accuracy

In [478]:
y_test.value_counts(normalize=True)

1    0.536667
0    0.463333
Name: subreddit, dtype: float64

## Pipeline (pipe)

In [502]:
log_reg_pipe = Pipeline([
    ("cvec", CountVectorizer()),
    ("knn", KNeighborsClassifier())
])

## GridSearchCV (gs)

**Looking at an untweaked model:**

In [503]:
# Testing the model performance on new data
print(f"CV Score Mean: {cross_val_score(pipe, X_train, y_train, cv=5).mean()}")

# Fit
log_reg_pipe.fit(X_train, y_train)

# Train Score
print(f"Train Score: {log_reg_pipe.score(X_train, y_train)}")

# Test Score
print(f"Test Score: {log_reg_pipe.score(X_test, y_test)}")

CV Score Mean: 0.7520796640021677
Train Score: 0.8078817733990148
Test Score: 0.66


**Notes:**
- Model is very overfit.
- As opposed to the very bad model made for `model-1.1`, these scores indicate slightly lower variance.

## GridSearchCV (gs)

In [505]:
logreg_model_params = {
    "cvec__ngram_range": [(1, 1), (1, 2)],     # 2
    "cvec__max_features": [1000, 2500, 5000],  # 3
    "cvec__min_df": [1, 2, 5, 10],             # 4
    "knn__n_neighbors": [3, 5, 9, 11],         # 4
    "knn__p": [1, 2]                           # 2
}


In [522]:
gs = GridSearchCV(log_reg_pipe,
                  logreg_model_params,
                  cv=5,
                  verbose=10)

In [523]:
gs.fit(X_train, y_train);

Fitting 5 folds for each of 192 candidates, totalling 960 fits
[CV] cvec__max_features=1000, cvec__min_df=1, cvec__ngram_range=(1, 1), knn__n_neighbors=3, knn__p=1 
[CV]  cvec__max_features=1000, cvec__min_df=1, cvec__ngram_range=(1, 1), knn__n_neighbors=3, knn__p=1, score=0.549, total=   0.0s
[CV] cvec__max_features=1000, cvec__min_df=1, cvec__ngram_range=(1, 1), knn__n_neighbors=3, knn__p=1 
[CV]  cvec__max_features=1000, cvec__min_df=1, cvec__ngram_range=(1, 1), knn__n_neighbors=3, knn__p=1, score=0.680, total=   0.0s
[CV] cvec__max_features=1000, cvec__min_df=1, cvec__ngram_range=(1, 1), knn__n_neighbors=3, knn__p=1 
[CV]  cvec__max_features=1000, cvec__min_df=1, cvec__ngram_range=(1, 1), knn__n_neighbors=3, knn__p=1, score=0.656, total=   0.0s
[CV] cvec__max_features=1000, cvec__min_df=1, cvec__ngram_range=(1, 1), knn__n_neighbors=3, knn__p=1 
[CV]  cvec__max_features=1000, cvec__min_df=1, cvec__ngram_range=(1, 1), knn__n_neighbors=3, knn__p=1, score=0.590, total=   0.0s
[CV] cvec

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    0.1s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:    0.1s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed:    0.2s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:    0.2s remaining:    0.0s


[CV]  cvec__max_features=1000, cvec__min_df=1, cvec__ngram_range=(1, 1), knn__n_neighbors=3, knn__p=2, score=0.525, total=   0.0s
[CV] cvec__max_features=1000, cvec__min_df=1, cvec__ngram_range=(1, 1), knn__n_neighbors=3, knn__p=2 
[CV]  cvec__max_features=1000, cvec__min_df=1, cvec__ngram_range=(1, 1), knn__n_neighbors=3, knn__p=2, score=0.680, total=   0.1s
[CV] cvec__max_features=1000, cvec__min_df=1, cvec__ngram_range=(1, 1), knn__n_neighbors=3, knn__p=2 
[CV]  cvec__max_features=1000, cvec__min_df=1, cvec__ngram_range=(1, 1), knn__n_neighbors=3, knn__p=2, score=0.672, total=   0.0s
[CV] cvec__max_features=1000, cvec__min_df=1, cvec__ngram_range=(1, 1), knn__n_neighbors=3, knn__p=2 
[CV]  cvec__max_features=1000, cvec__min_df=1, cvec__ngram_range=(1, 1), knn__n_neighbors=3, knn__p=2, score=0.615, total=   0.0s
[CV] cvec__max_features=1000, cvec__min_df=1, cvec__ngram_range=(1, 1), knn__n_neighbors=3, knn__p=2 
[CV]  cvec__max_features=1000, cvec__min_df=1, cvec__ngram_range=(1, 1),

[Parallel(n_jobs=1)]: Done   6 out of   6 | elapsed:    0.2s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   7 out of   7 | elapsed:    0.3s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   8 out of   8 | elapsed:    0.3s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   9 out of   9 | elapsed:    0.4s remaining:    0.0s



[CV] cvec__max_features=1000, cvec__min_df=1, cvec__ngram_range=(1, 1), knn__n_neighbors=5, knn__p=1 
[CV]  cvec__max_features=1000, cvec__min_df=1, cvec__ngram_range=(1, 1), knn__n_neighbors=5, knn__p=1, score=0.746, total=   0.0s
[CV] cvec__max_features=1000, cvec__min_df=1, cvec__ngram_range=(1, 1), knn__n_neighbors=5, knn__p=1 
[CV]  cvec__max_features=1000, cvec__min_df=1, cvec__ngram_range=(1, 1), knn__n_neighbors=5, knn__p=1, score=0.623, total=   0.0s
[CV] cvec__max_features=1000, cvec__min_df=1, cvec__ngram_range=(1, 1), knn__n_neighbors=5, knn__p=1 
[CV]  cvec__max_features=1000, cvec__min_df=1, cvec__ngram_range=(1, 1), knn__n_neighbors=5, knn__p=1, score=0.656, total=   0.0s
[CV] cvec__max_features=1000, cvec__min_df=1, cvec__ngram_range=(1, 1), knn__n_neighbors=5, knn__p=1 
[CV]  cvec__max_features=1000, cvec__min_df=1, cvec__ngram_range=(1, 1), knn__n_neighbors=5, knn__p=1, score=0.595, total=   0.0s
[CV] cvec__max_features=1000, cvec__min_df=1, cvec__ngram_range=(1, 1),

[CV]  cvec__max_features=1000, cvec__min_df=1, cvec__ngram_range=(1, 2), knn__n_neighbors=3, knn__p=2, score=0.689, total=   0.0s
[CV] cvec__max_features=1000, cvec__min_df=1, cvec__ngram_range=(1, 2), knn__n_neighbors=3, knn__p=2 
[CV]  cvec__max_features=1000, cvec__min_df=1, cvec__ngram_range=(1, 2), knn__n_neighbors=3, knn__p=2, score=0.574, total=   0.1s
[CV] cvec__max_features=1000, cvec__min_df=1, cvec__ngram_range=(1, 2), knn__n_neighbors=3, knn__p=2 
[CV]  cvec__max_features=1000, cvec__min_df=1, cvec__ngram_range=(1, 2), knn__n_neighbors=3, knn__p=2, score=0.598, total=   0.0s
[CV] cvec__max_features=1000, cvec__min_df=1, cvec__ngram_range=(1, 2), knn__n_neighbors=3, knn__p=2 
[CV]  cvec__max_features=1000, cvec__min_df=1, cvec__ngram_range=(1, 2), knn__n_neighbors=3, knn__p=2, score=0.678, total=   0.0s
[CV] cvec__max_features=1000, cvec__min_df=1, cvec__ngram_range=(1, 2), knn__n_neighbors=5, knn__p=1 
[CV]  cvec__max_features=1000, cvec__min_df=1, cvec__ngram_range=(1, 2),

[CV]  cvec__max_features=1000, cvec__min_df=2, cvec__ngram_range=(1, 1), knn__n_neighbors=3, knn__p=1, score=0.557, total=   0.0s
[CV] cvec__max_features=1000, cvec__min_df=2, cvec__ngram_range=(1, 1), knn__n_neighbors=3, knn__p=1 
[CV]  cvec__max_features=1000, cvec__min_df=2, cvec__ngram_range=(1, 1), knn__n_neighbors=3, knn__p=1, score=0.590, total=   0.0s
[CV] cvec__max_features=1000, cvec__min_df=2, cvec__ngram_range=(1, 1), knn__n_neighbors=3, knn__p=1 
[CV]  cvec__max_features=1000, cvec__min_df=2, cvec__ngram_range=(1, 1), knn__n_neighbors=3, knn__p=1, score=0.694, total=   0.0s
[CV] cvec__max_features=1000, cvec__min_df=2, cvec__ngram_range=(1, 1), knn__n_neighbors=3, knn__p=2 
[CV]  cvec__max_features=1000, cvec__min_df=2, cvec__ngram_range=(1, 1), knn__n_neighbors=3, knn__p=2, score=0.508, total=   0.0s
[CV] cvec__max_features=1000, cvec__min_df=2, cvec__ngram_range=(1, 1), knn__n_neighbors=3, knn__p=2 
[CV]  cvec__max_features=1000, cvec__min_df=2, cvec__ngram_range=(1, 1),

[CV]  cvec__max_features=1000, cvec__min_df=2, cvec__ngram_range=(1, 1), knn__n_neighbors=11, knn__p=2, score=0.680, total=   0.0s
[CV] cvec__max_features=1000, cvec__min_df=2, cvec__ngram_range=(1, 1), knn__n_neighbors=11, knn__p=2 
[CV]  cvec__max_features=1000, cvec__min_df=2, cvec__ngram_range=(1, 1), knn__n_neighbors=11, knn__p=2, score=0.702, total=   0.0s
[CV] cvec__max_features=1000, cvec__min_df=2, cvec__ngram_range=(1, 2), knn__n_neighbors=3, knn__p=1 
[CV]  cvec__max_features=1000, cvec__min_df=2, cvec__ngram_range=(1, 2), knn__n_neighbors=3, knn__p=1, score=0.508, total=   0.0s
[CV] cvec__max_features=1000, cvec__min_df=2, cvec__ngram_range=(1, 2), knn__n_neighbors=3, knn__p=1 
[CV]  cvec__max_features=1000, cvec__min_df=2, cvec__ngram_range=(1, 2), knn__n_neighbors=3, knn__p=1, score=0.680, total=   0.0s
[CV] cvec__max_features=1000, cvec__min_df=2, cvec__ngram_range=(1, 2), knn__n_neighbors=3, knn__p=1 
[CV]  cvec__max_features=1000, cvec__min_df=2, cvec__ngram_range=(1, 

[CV]  cvec__max_features=1000, cvec__min_df=2, cvec__ngram_range=(1, 2), knn__n_neighbors=11, knn__p=2, score=0.664, total=   0.1s
[CV] cvec__max_features=1000, cvec__min_df=2, cvec__ngram_range=(1, 2), knn__n_neighbors=11, knn__p=2 
[CV]  cvec__max_features=1000, cvec__min_df=2, cvec__ngram_range=(1, 2), knn__n_neighbors=11, knn__p=2, score=0.615, total=   0.1s
[CV] cvec__max_features=1000, cvec__min_df=2, cvec__ngram_range=(1, 2), knn__n_neighbors=11, knn__p=2 
[CV]  cvec__max_features=1000, cvec__min_df=2, cvec__ngram_range=(1, 2), knn__n_neighbors=11, knn__p=2, score=0.598, total=   0.0s
[CV] cvec__max_features=1000, cvec__min_df=2, cvec__ngram_range=(1, 2), knn__n_neighbors=11, knn__p=2 
[CV]  cvec__max_features=1000, cvec__min_df=2, cvec__ngram_range=(1, 2), knn__n_neighbors=11, knn__p=2, score=0.702, total=   0.1s
[CV] cvec__max_features=1000, cvec__min_df=5, cvec__ngram_range=(1, 1), knn__n_neighbors=3, knn__p=1 
[CV]  cvec__max_features=1000, cvec__min_df=5, cvec__ngram_range=

[CV]  cvec__max_features=1000, cvec__min_df=5, cvec__ngram_range=(1, 1), knn__n_neighbors=11, knn__p=1, score=0.664, total=   0.0s
[CV] cvec__max_features=1000, cvec__min_df=5, cvec__ngram_range=(1, 1), knn__n_neighbors=11, knn__p=1 
[CV]  cvec__max_features=1000, cvec__min_df=5, cvec__ngram_range=(1, 1), knn__n_neighbors=11, knn__p=1, score=0.554, total=   0.0s
[CV] cvec__max_features=1000, cvec__min_df=5, cvec__ngram_range=(1, 1), knn__n_neighbors=11, knn__p=2 
[CV]  cvec__max_features=1000, cvec__min_df=5, cvec__ngram_range=(1, 1), knn__n_neighbors=11, knn__p=2, score=0.672, total=   0.0s
[CV] cvec__max_features=1000, cvec__min_df=5, cvec__ngram_range=(1, 1), knn__n_neighbors=11, knn__p=2 
[CV]  cvec__max_features=1000, cvec__min_df=5, cvec__ngram_range=(1, 1), knn__n_neighbors=11, knn__p=2, score=0.689, total=   0.0s
[CV] cvec__max_features=1000, cvec__min_df=5, cvec__ngram_range=(1, 1), knn__n_neighbors=11, knn__p=2 
[CV]  cvec__max_features=1000, cvec__min_df=5, cvec__ngram_range

[CV]  cvec__max_features=1000, cvec__min_df=5, cvec__ngram_range=(1, 2), knn__n_neighbors=11, knn__p=1, score=0.680, total=   0.1s
[CV] cvec__max_features=1000, cvec__min_df=5, cvec__ngram_range=(1, 2), knn__n_neighbors=11, knn__p=1 
[CV]  cvec__max_features=1000, cvec__min_df=5, cvec__ngram_range=(1, 2), knn__n_neighbors=11, knn__p=1, score=0.639, total=   0.1s
[CV] cvec__max_features=1000, cvec__min_df=5, cvec__ngram_range=(1, 2), knn__n_neighbors=11, knn__p=1 
[CV]  cvec__max_features=1000, cvec__min_df=5, cvec__ngram_range=(1, 2), knn__n_neighbors=11, knn__p=1, score=0.570, total=   0.1s
[CV] cvec__max_features=1000, cvec__min_df=5, cvec__ngram_range=(1, 2), knn__n_neighbors=11, knn__p=2 
[CV]  cvec__max_features=1000, cvec__min_df=5, cvec__ngram_range=(1, 2), knn__n_neighbors=11, knn__p=2, score=0.623, total=   0.1s
[CV] cvec__max_features=1000, cvec__min_df=5, cvec__ngram_range=(1, 2), knn__n_neighbors=11, knn__p=2 
[CV]  cvec__max_features=1000, cvec__min_df=5, cvec__ngram_range

[CV] cvec__max_features=1000, cvec__min_df=10, cvec__ngram_range=(1, 1), knn__n_neighbors=11, knn__p=1 
[CV]  cvec__max_features=1000, cvec__min_df=10, cvec__ngram_range=(1, 1), knn__n_neighbors=11, knn__p=1, score=0.631, total=   0.0s
[CV] cvec__max_features=1000, cvec__min_df=10, cvec__ngram_range=(1, 1), knn__n_neighbors=11, knn__p=1 
[CV]  cvec__max_features=1000, cvec__min_df=10, cvec__ngram_range=(1, 1), knn__n_neighbors=11, knn__p=1, score=0.669, total=   0.0s
[CV] cvec__max_features=1000, cvec__min_df=10, cvec__ngram_range=(1, 1), knn__n_neighbors=11, knn__p=2 
[CV]  cvec__max_features=1000, cvec__min_df=10, cvec__ngram_range=(1, 1), knn__n_neighbors=11, knn__p=2, score=0.672, total=   0.0s
[CV] cvec__max_features=1000, cvec__min_df=10, cvec__ngram_range=(1, 1), knn__n_neighbors=11, knn__p=2 
[CV]  cvec__max_features=1000, cvec__min_df=10, cvec__ngram_range=(1, 1), knn__n_neighbors=11, knn__p=2, score=0.689, total=   0.0s
[CV] cvec__max_features=1000, cvec__min_df=10, cvec__ngr

[CV]  cvec__max_features=1000, cvec__min_df=10, cvec__ngram_range=(1, 2), knn__n_neighbors=9, knn__p=2, score=0.623, total=   0.0s
[CV] cvec__max_features=1000, cvec__min_df=10, cvec__ngram_range=(1, 2), knn__n_neighbors=9, knn__p=2 
[CV]  cvec__max_features=1000, cvec__min_df=10, cvec__ngram_range=(1, 2), knn__n_neighbors=9, knn__p=2, score=0.661, total=   0.0s
[CV] cvec__max_features=1000, cvec__min_df=10, cvec__ngram_range=(1, 2), knn__n_neighbors=11, knn__p=1 
[CV]  cvec__max_features=1000, cvec__min_df=10, cvec__ngram_range=(1, 2), knn__n_neighbors=11, knn__p=1, score=0.672, total=   0.0s
[CV] cvec__max_features=1000, cvec__min_df=10, cvec__ngram_range=(1, 2), knn__n_neighbors=11, knn__p=1 
[CV]  cvec__max_features=1000, cvec__min_df=10, cvec__ngram_range=(1, 2), knn__n_neighbors=11, knn__p=1, score=0.672, total=   0.0s
[CV] cvec__max_features=1000, cvec__min_df=10, cvec__ngram_range=(1, 2), knn__n_neighbors=11, knn__p=1 
[CV]  cvec__max_features=1000, cvec__min_df=10, cvec__ngram

[CV]  cvec__max_features=2500, cvec__min_df=1, cvec__ngram_range=(1, 1), knn__n_neighbors=9, knn__p=2, score=0.656, total=   0.1s
[CV] cvec__max_features=2500, cvec__min_df=1, cvec__ngram_range=(1, 1), knn__n_neighbors=9, knn__p=2 
[CV]  cvec__max_features=2500, cvec__min_df=1, cvec__ngram_range=(1, 1), knn__n_neighbors=9, knn__p=2, score=0.705, total=   0.0s
[CV] cvec__max_features=2500, cvec__min_df=1, cvec__ngram_range=(1, 1), knn__n_neighbors=9, knn__p=2 
[CV]  cvec__max_features=2500, cvec__min_df=1, cvec__ngram_range=(1, 1), knn__n_neighbors=9, knn__p=2, score=0.639, total=   0.0s
[CV] cvec__max_features=2500, cvec__min_df=1, cvec__ngram_range=(1, 1), knn__n_neighbors=9, knn__p=2 
[CV]  cvec__max_features=2500, cvec__min_df=1, cvec__ngram_range=(1, 1), knn__n_neighbors=9, knn__p=2, score=0.648, total=   0.0s
[CV] cvec__max_features=2500, cvec__min_df=1, cvec__ngram_range=(1, 1), knn__n_neighbors=9, knn__p=2 
[CV]  cvec__max_features=2500, cvec__min_df=1, cvec__ngram_range=(1, 1),

[CV]  cvec__max_features=2500, cvec__min_df=1, cvec__ngram_range=(1, 2), knn__n_neighbors=9, knn__p=1, score=0.512, total=   0.1s
[CV] cvec__max_features=2500, cvec__min_df=1, cvec__ngram_range=(1, 2), knn__n_neighbors=9, knn__p=2 
[CV]  cvec__max_features=2500, cvec__min_df=1, cvec__ngram_range=(1, 2), knn__n_neighbors=9, knn__p=2, score=0.557, total=   0.0s
[CV] cvec__max_features=2500, cvec__min_df=1, cvec__ngram_range=(1, 2), knn__n_neighbors=9, knn__p=2 
[CV]  cvec__max_features=2500, cvec__min_df=1, cvec__ngram_range=(1, 2), knn__n_neighbors=9, knn__p=2, score=0.689, total=   0.1s
[CV] cvec__max_features=2500, cvec__min_df=1, cvec__ngram_range=(1, 2), knn__n_neighbors=9, knn__p=2 
[CV]  cvec__max_features=2500, cvec__min_df=1, cvec__ngram_range=(1, 2), knn__n_neighbors=9, knn__p=2, score=0.623, total=   0.0s
[CV] cvec__max_features=2500, cvec__min_df=1, cvec__ngram_range=(1, 2), knn__n_neighbors=9, knn__p=2 
[CV]  cvec__max_features=2500, cvec__min_df=1, cvec__ngram_range=(1, 2),

[CV]  cvec__max_features=2500, cvec__min_df=2, cvec__ngram_range=(1, 1), knn__n_neighbors=9, knn__p=1, score=0.631, total=   0.0s
[CV] cvec__max_features=2500, cvec__min_df=2, cvec__ngram_range=(1, 1), knn__n_neighbors=9, knn__p=1 
[CV]  cvec__max_features=2500, cvec__min_df=2, cvec__ngram_range=(1, 1), knn__n_neighbors=9, knn__p=1, score=0.678, total=   0.0s
[CV] cvec__max_features=2500, cvec__min_df=2, cvec__ngram_range=(1, 1), knn__n_neighbors=9, knn__p=2 
[CV]  cvec__max_features=2500, cvec__min_df=2, cvec__ngram_range=(1, 1), knn__n_neighbors=9, knn__p=2, score=0.516, total=   0.0s
[CV] cvec__max_features=2500, cvec__min_df=2, cvec__ngram_range=(1, 1), knn__n_neighbors=9, knn__p=2 
[CV]  cvec__max_features=2500, cvec__min_df=2, cvec__ngram_range=(1, 1), knn__n_neighbors=9, knn__p=2, score=0.664, total=   0.0s
[CV] cvec__max_features=2500, cvec__min_df=2, cvec__ngram_range=(1, 1), knn__n_neighbors=9, knn__p=2 
[CV]  cvec__max_features=2500, cvec__min_df=2, cvec__ngram_range=(1, 1),

[CV]  cvec__max_features=2500, cvec__min_df=2, cvec__ngram_range=(1, 2), knn__n_neighbors=9, knn__p=1, score=0.639, total=   0.1s
[CV] cvec__max_features=2500, cvec__min_df=2, cvec__ngram_range=(1, 2), knn__n_neighbors=9, knn__p=1 
[CV]  cvec__max_features=2500, cvec__min_df=2, cvec__ngram_range=(1, 2), knn__n_neighbors=9, knn__p=1, score=0.603, total=   0.1s
[CV] cvec__max_features=2500, cvec__min_df=2, cvec__ngram_range=(1, 2), knn__n_neighbors=9, knn__p=2 
[CV]  cvec__max_features=2500, cvec__min_df=2, cvec__ngram_range=(1, 2), knn__n_neighbors=9, knn__p=2, score=0.557, total=   0.1s
[CV] cvec__max_features=2500, cvec__min_df=2, cvec__ngram_range=(1, 2), knn__n_neighbors=9, knn__p=2 
[CV]  cvec__max_features=2500, cvec__min_df=2, cvec__ngram_range=(1, 2), knn__n_neighbors=9, knn__p=2, score=0.680, total=   0.0s
[CV] cvec__max_features=2500, cvec__min_df=2, cvec__ngram_range=(1, 2), knn__n_neighbors=9, knn__p=2 
[CV]  cvec__max_features=2500, cvec__min_df=2, cvec__ngram_range=(1, 2),

[CV]  cvec__max_features=2500, cvec__min_df=5, cvec__ngram_range=(1, 1), knn__n_neighbors=9, knn__p=1, score=0.648, total=   0.1s
[CV] cvec__max_features=2500, cvec__min_df=5, cvec__ngram_range=(1, 1), knn__n_neighbors=9, knn__p=1 
[CV]  cvec__max_features=2500, cvec__min_df=5, cvec__ngram_range=(1, 1), knn__n_neighbors=9, knn__p=1, score=0.579, total=   0.0s
[CV] cvec__max_features=2500, cvec__min_df=5, cvec__ngram_range=(1, 1), knn__n_neighbors=9, knn__p=2 
[CV]  cvec__max_features=2500, cvec__min_df=5, cvec__ngram_range=(1, 1), knn__n_neighbors=9, knn__p=2, score=0.656, total=   0.0s
[CV] cvec__max_features=2500, cvec__min_df=5, cvec__ngram_range=(1, 1), knn__n_neighbors=9, knn__p=2 
[CV]  cvec__max_features=2500, cvec__min_df=5, cvec__ngram_range=(1, 1), knn__n_neighbors=9, knn__p=2, score=0.705, total=   0.0s
[CV] cvec__max_features=2500, cvec__min_df=5, cvec__ngram_range=(1, 1), knn__n_neighbors=9, knn__p=2 
[CV]  cvec__max_features=2500, cvec__min_df=5, cvec__ngram_range=(1, 1),

[CV]  cvec__max_features=2500, cvec__min_df=5, cvec__ngram_range=(1, 2), knn__n_neighbors=5, knn__p=2, score=0.686, total=   0.0s
[CV] cvec__max_features=2500, cvec__min_df=5, cvec__ngram_range=(1, 2), knn__n_neighbors=9, knn__p=1 
[CV]  cvec__max_features=2500, cvec__min_df=5, cvec__ngram_range=(1, 2), knn__n_neighbors=9, knn__p=1, score=0.689, total=   0.0s
[CV] cvec__max_features=2500, cvec__min_df=5, cvec__ngram_range=(1, 2), knn__n_neighbors=9, knn__p=1 
[CV]  cvec__max_features=2500, cvec__min_df=5, cvec__ngram_range=(1, 2), knn__n_neighbors=9, knn__p=1, score=0.672, total=   0.0s
[CV] cvec__max_features=2500, cvec__min_df=5, cvec__ngram_range=(1, 2), knn__n_neighbors=9, knn__p=1 
[CV]  cvec__max_features=2500, cvec__min_df=5, cvec__ngram_range=(1, 2), knn__n_neighbors=9, knn__p=1, score=0.680, total=   0.1s
[CV] cvec__max_features=2500, cvec__min_df=5, cvec__ngram_range=(1, 2), knn__n_neighbors=9, knn__p=1 
[CV]  cvec__max_features=2500, cvec__min_df=5, cvec__ngram_range=(1, 2),

[CV]  cvec__max_features=2500, cvec__min_df=10, cvec__ngram_range=(1, 1), knn__n_neighbors=5, knn__p=2, score=0.686, total=   0.0s
[CV] cvec__max_features=2500, cvec__min_df=10, cvec__ngram_range=(1, 1), knn__n_neighbors=9, knn__p=1 
[CV]  cvec__max_features=2500, cvec__min_df=10, cvec__ngram_range=(1, 1), knn__n_neighbors=9, knn__p=1, score=0.713, total=   0.0s
[CV] cvec__max_features=2500, cvec__min_df=10, cvec__ngram_range=(1, 1), knn__n_neighbors=9, knn__p=1 
[CV]  cvec__max_features=2500, cvec__min_df=10, cvec__ngram_range=(1, 1), knn__n_neighbors=9, knn__p=1, score=0.705, total=   0.0s
[CV] cvec__max_features=2500, cvec__min_df=10, cvec__ngram_range=(1, 1), knn__n_neighbors=9, knn__p=1 
[CV]  cvec__max_features=2500, cvec__min_df=10, cvec__ngram_range=(1, 1), knn__n_neighbors=9, knn__p=1, score=0.656, total=   0.0s
[CV] cvec__max_features=2500, cvec__min_df=10, cvec__ngram_range=(1, 1), knn__n_neighbors=9, knn__p=1 
[CV]  cvec__max_features=2500, cvec__min_df=10, cvec__ngram_rang

[CV]  cvec__max_features=2500, cvec__min_df=10, cvec__ngram_range=(1, 2), knn__n_neighbors=5, knn__p=2, score=0.607, total=   0.1s
[CV] cvec__max_features=2500, cvec__min_df=10, cvec__ngram_range=(1, 2), knn__n_neighbors=5, knn__p=2 
[CV]  cvec__max_features=2500, cvec__min_df=10, cvec__ngram_range=(1, 2), knn__n_neighbors=5, knn__p=2, score=0.680, total=   0.0s
[CV] cvec__max_features=2500, cvec__min_df=10, cvec__ngram_range=(1, 2), knn__n_neighbors=5, knn__p=2 
[CV]  cvec__max_features=2500, cvec__min_df=10, cvec__ngram_range=(1, 2), knn__n_neighbors=5, knn__p=2, score=0.582, total=   0.0s
[CV] cvec__max_features=2500, cvec__min_df=10, cvec__ngram_range=(1, 2), knn__n_neighbors=5, knn__p=2 
[CV]  cvec__max_features=2500, cvec__min_df=10, cvec__ngram_range=(1, 2), knn__n_neighbors=5, knn__p=2, score=0.549, total=   0.0s
[CV] cvec__max_features=2500, cvec__min_df=10, cvec__ngram_range=(1, 2), knn__n_neighbors=5, knn__p=2 
[CV]  cvec__max_features=2500, cvec__min_df=10, cvec__ngram_rang

[CV]  cvec__max_features=5000, cvec__min_df=1, cvec__ngram_range=(1, 1), knn__n_neighbors=5, knn__p=1, score=0.672, total=   0.0s
[CV] cvec__max_features=5000, cvec__min_df=1, cvec__ngram_range=(1, 1), knn__n_neighbors=5, knn__p=1 
[CV]  cvec__max_features=5000, cvec__min_df=1, cvec__ngram_range=(1, 1), knn__n_neighbors=5, knn__p=1, score=0.736, total=   0.0s
[CV] cvec__max_features=5000, cvec__min_df=1, cvec__ngram_range=(1, 1), knn__n_neighbors=5, knn__p=2 
[CV]  cvec__max_features=5000, cvec__min_df=1, cvec__ngram_range=(1, 1), knn__n_neighbors=5, knn__p=2, score=0.680, total=   0.0s
[CV] cvec__max_features=5000, cvec__min_df=1, cvec__ngram_range=(1, 1), knn__n_neighbors=5, knn__p=2 
[CV]  cvec__max_features=5000, cvec__min_df=1, cvec__ngram_range=(1, 1), knn__n_neighbors=5, knn__p=2, score=0.664, total=   0.0s
[CV] cvec__max_features=5000, cvec__min_df=1, cvec__ngram_range=(1, 1), knn__n_neighbors=5, knn__p=2 
[CV]  cvec__max_features=5000, cvec__min_df=1, cvec__ngram_range=(1, 1),

[CV]  cvec__max_features=5000, cvec__min_df=1, cvec__ngram_range=(1, 2), knn__n_neighbors=5, knn__p=1, score=0.590, total=   0.1s
[CV] cvec__max_features=5000, cvec__min_df=1, cvec__ngram_range=(1, 2), knn__n_neighbors=5, knn__p=1 
[CV]  cvec__max_features=5000, cvec__min_df=1, cvec__ngram_range=(1, 2), knn__n_neighbors=5, knn__p=1, score=0.636, total=   0.1s
[CV] cvec__max_features=5000, cvec__min_df=1, cvec__ngram_range=(1, 2), knn__n_neighbors=5, knn__p=2 
[CV]  cvec__max_features=5000, cvec__min_df=1, cvec__ngram_range=(1, 2), knn__n_neighbors=5, knn__p=2, score=0.590, total=   0.1s
[CV] cvec__max_features=5000, cvec__min_df=1, cvec__ngram_range=(1, 2), knn__n_neighbors=5, knn__p=2 
[CV]  cvec__max_features=5000, cvec__min_df=1, cvec__ngram_range=(1, 2), knn__n_neighbors=5, knn__p=2, score=0.639, total=   0.1s
[CV] cvec__max_features=5000, cvec__min_df=1, cvec__ngram_range=(1, 2), knn__n_neighbors=5, knn__p=2 
[CV]  cvec__max_features=5000, cvec__min_df=1, cvec__ngram_range=(1, 2),

[CV]  cvec__max_features=5000, cvec__min_df=2, cvec__ngram_range=(1, 1), knn__n_neighbors=5, knn__p=2, score=0.566, total=   0.0s
[CV] cvec__max_features=5000, cvec__min_df=2, cvec__ngram_range=(1, 1), knn__n_neighbors=5, knn__p=2 
[CV]  cvec__max_features=5000, cvec__min_df=2, cvec__ngram_range=(1, 1), knn__n_neighbors=5, knn__p=2, score=0.697, total=   0.0s
[CV] cvec__max_features=5000, cvec__min_df=2, cvec__ngram_range=(1, 1), knn__n_neighbors=5, knn__p=2 
[CV]  cvec__max_features=5000, cvec__min_df=2, cvec__ngram_range=(1, 1), knn__n_neighbors=5, knn__p=2, score=0.664, total=   0.0s
[CV] cvec__max_features=5000, cvec__min_df=2, cvec__ngram_range=(1, 1), knn__n_neighbors=5, knn__p=2 
[CV]  cvec__max_features=5000, cvec__min_df=2, cvec__ngram_range=(1, 1), knn__n_neighbors=5, knn__p=2, score=0.639, total=   0.0s
[CV] cvec__max_features=5000, cvec__min_df=2, cvec__ngram_range=(1, 1), knn__n_neighbors=5, knn__p=2 
[CV]  cvec__max_features=5000, cvec__min_df=2, cvec__ngram_range=(1, 1),

[CV]  cvec__max_features=5000, cvec__min_df=2, cvec__ngram_range=(1, 2), knn__n_neighbors=5, knn__p=1, score=0.598, total=   0.1s
[CV] cvec__max_features=5000, cvec__min_df=2, cvec__ngram_range=(1, 2), knn__n_neighbors=5, knn__p=1 
[CV]  cvec__max_features=5000, cvec__min_df=2, cvec__ngram_range=(1, 2), knn__n_neighbors=5, knn__p=1, score=0.672, total=   0.1s
[CV] cvec__max_features=5000, cvec__min_df=2, cvec__ngram_range=(1, 2), knn__n_neighbors=5, knn__p=1 
[CV]  cvec__max_features=5000, cvec__min_df=2, cvec__ngram_range=(1, 2), knn__n_neighbors=5, knn__p=1, score=0.603, total=   0.1s
[CV] cvec__max_features=5000, cvec__min_df=2, cvec__ngram_range=(1, 2), knn__n_neighbors=5, knn__p=2 
[CV]  cvec__max_features=5000, cvec__min_df=2, cvec__ngram_range=(1, 2), knn__n_neighbors=5, knn__p=2, score=0.566, total=   0.0s
[CV] cvec__max_features=5000, cvec__min_df=2, cvec__ngram_range=(1, 2), knn__n_neighbors=5, knn__p=2 
[CV]  cvec__max_features=5000, cvec__min_df=2, cvec__ngram_range=(1, 2),

[CV]  cvec__max_features=5000, cvec__min_df=5, cvec__ngram_range=(1, 1), knn__n_neighbors=5, knn__p=1, score=0.607, total=   0.0s
[CV] cvec__max_features=5000, cvec__min_df=5, cvec__ngram_range=(1, 1), knn__n_neighbors=5, knn__p=1 
[CV]  cvec__max_features=5000, cvec__min_df=5, cvec__ngram_range=(1, 1), knn__n_neighbors=5, knn__p=1, score=0.721, total=   0.0s
[CV] cvec__max_features=5000, cvec__min_df=5, cvec__ngram_range=(1, 1), knn__n_neighbors=5, knn__p=1 
[CV]  cvec__max_features=5000, cvec__min_df=5, cvec__ngram_range=(1, 1), knn__n_neighbors=5, knn__p=1, score=0.656, total=   0.0s
[CV] cvec__max_features=5000, cvec__min_df=5, cvec__ngram_range=(1, 1), knn__n_neighbors=5, knn__p=1 
[CV]  cvec__max_features=5000, cvec__min_df=5, cvec__ngram_range=(1, 1), knn__n_neighbors=5, knn__p=1, score=0.656, total=   0.0s
[CV] cvec__max_features=5000, cvec__min_df=5, cvec__ngram_range=(1, 1), knn__n_neighbors=5, knn__p=1 
[CV]  cvec__max_features=5000, cvec__min_df=5, cvec__ngram_range=(1, 1),

[CV]  cvec__max_features=5000, cvec__min_df=5, cvec__ngram_range=(1, 2), knn__n_neighbors=3, knn__p=2, score=0.648, total=   0.0s
[CV] cvec__max_features=5000, cvec__min_df=5, cvec__ngram_range=(1, 2), knn__n_neighbors=3, knn__p=2 
[CV]  cvec__max_features=5000, cvec__min_df=5, cvec__ngram_range=(1, 2), knn__n_neighbors=3, knn__p=2, score=0.661, total=   0.0s
[CV] cvec__max_features=5000, cvec__min_df=5, cvec__ngram_range=(1, 2), knn__n_neighbors=5, knn__p=1 
[CV]  cvec__max_features=5000, cvec__min_df=5, cvec__ngram_range=(1, 2), knn__n_neighbors=5, knn__p=1, score=0.607, total=   0.1s
[CV] cvec__max_features=5000, cvec__min_df=5, cvec__ngram_range=(1, 2), knn__n_neighbors=5, knn__p=1 
[CV]  cvec__max_features=5000, cvec__min_df=5, cvec__ngram_range=(1, 2), knn__n_neighbors=5, knn__p=1, score=0.672, total=   0.1s
[CV] cvec__max_features=5000, cvec__min_df=5, cvec__ngram_range=(1, 2), knn__n_neighbors=5, knn__p=1 
[CV]  cvec__max_features=5000, cvec__min_df=5, cvec__ngram_range=(1, 2),

[CV]  cvec__max_features=5000, cvec__min_df=10, cvec__ngram_range=(1, 1), knn__n_neighbors=5, knn__p=1, score=0.730, total=   0.0s
[CV] cvec__max_features=5000, cvec__min_df=10, cvec__ngram_range=(1, 1), knn__n_neighbors=5, knn__p=1 
[CV]  cvec__max_features=5000, cvec__min_df=10, cvec__ngram_range=(1, 1), knn__n_neighbors=5, knn__p=1, score=0.689, total=   0.0s
[CV] cvec__max_features=5000, cvec__min_df=10, cvec__ngram_range=(1, 1), knn__n_neighbors=5, knn__p=1 
[CV]  cvec__max_features=5000, cvec__min_df=10, cvec__ngram_range=(1, 1), knn__n_neighbors=5, knn__p=1, score=0.574, total=   0.0s
[CV] cvec__max_features=5000, cvec__min_df=10, cvec__ngram_range=(1, 1), knn__n_neighbors=5, knn__p=1 
[CV]  cvec__max_features=5000, cvec__min_df=10, cvec__ngram_range=(1, 1), knn__n_neighbors=5, knn__p=1, score=0.590, total=   0.0s
[CV] cvec__max_features=5000, cvec__min_df=10, cvec__ngram_range=(1, 1), knn__n_neighbors=5, knn__p=1 
[CV]  cvec__max_features=5000, cvec__min_df=10, cvec__ngram_rang

[CV]  cvec__max_features=5000, cvec__min_df=10, cvec__ngram_range=(1, 2), knn__n_neighbors=3, knn__p=2, score=0.664, total=   0.1s
[CV] cvec__max_features=5000, cvec__min_df=10, cvec__ngram_range=(1, 2), knn__n_neighbors=3, knn__p=2 
[CV]  cvec__max_features=5000, cvec__min_df=10, cvec__ngram_range=(1, 2), knn__n_neighbors=3, knn__p=2, score=0.598, total=   0.0s
[CV] cvec__max_features=5000, cvec__min_df=10, cvec__ngram_range=(1, 2), knn__n_neighbors=3, knn__p=2 
[CV]  cvec__max_features=5000, cvec__min_df=10, cvec__ngram_range=(1, 2), knn__n_neighbors=3, knn__p=2, score=0.656, total=   0.0s
[CV] cvec__max_features=5000, cvec__min_df=10, cvec__ngram_range=(1, 2), knn__n_neighbors=3, knn__p=2 
[CV]  cvec__max_features=5000, cvec__min_df=10, cvec__ngram_range=(1, 2), knn__n_neighbors=3, knn__p=2, score=0.686, total=   0.0s
[CV] cvec__max_features=5000, cvec__min_df=10, cvec__ngram_range=(1, 2), knn__n_neighbors=5, knn__p=1 
[CV]  cvec__max_features=5000, cvec__min_df=10, cvec__ngram_rang

[Parallel(n_jobs=1)]: Done 960 out of 960 | elapsed:   42.1s finished


GridSearchCV(cv=5, error_score=nan,
             estimator=Pipeline(memory=None,
                                steps=[('cvec',
                                        CountVectorizer(analyzer='word',
                                                        binary=False,
                                                        decode_error='strict',
                                                        dtype=<class 'numpy.int64'>,
                                                        encoding='utf-8',
                                                        input='content',
                                                        lowercase=True,
                                                        max_df=1.0,
                                                        max_features=None,
                                                        min_df=1,
                                                        ngram_range=(1, 1),
                                                        prep

In [511]:
print(f"Train Score: {gs.score(X_train, y_train)}")
print(f"Test Score: {gs.score(X_test, y_test)}")
print(f"Best Score: {gs.best_score_}")

Train Score: 0.8407224958949097
Test Score: 0.7
Best Score: 0.6945535835252675


In [530]:
gs.best_estimator_

Pipeline(memory=None,
         steps=[('cvec',
                 CountVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.int64'>, encoding='utf-8',
                                 input='content', lowercase=True, max_df=1.0,
                                 max_features=1000, min_df=5,
                                 ngram_range=(1, 1), preprocessor=None,
                                 stop_words=None, strip_accents=None,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=None, vocabulary=None)),
                ('knn',
                 KNeighborsClassifier(algorithm='auto', leaf_size=30,
                                      metric='minkowski', metric_params=None,
                                      n_jobs=None, n_neighbors=3, p=2,
                                      weights='uniform'))],
         verbose=False)

In [516]:
gs.predict(X_test)

array([0, 1, 1, 1, 0, 0, 0, 1, 0, 1, 0, 0, 1, 1, 1, 0, 1, 0, 1, 0, 0, 1,
       1, 1, 0, 1, 1, 0, 1, 0, 1, 0, 1, 1, 1, 0, 0, 1, 1, 0, 1, 1, 1, 1,
       0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1,
       0, 1, 1, 0, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1,
       1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 0, 1, 1, 0,
       1, 1, 1, 0, 0, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1,
       1, 0, 0, 1, 0, 1, 1, 1, 1, 1, 0, 1, 0, 0, 0, 1, 0, 1, 1, 1, 1, 1,
       1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 0, 1, 0,
       1, 0, 0, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1,
       1, 0, 1, 1, 1, 0, 1, 0, 1, 1, 0, 1, 0, 1, 1, 1, 0, 0, 0, 1, 0, 1,
       1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1,
       0, 1, 1, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 1, 0, 1, 1, 1, 1,
       1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 0,
       0, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 0, 1])

In [517]:
gs.predict_proba(X_test)

array([[0.66666667, 0.33333333],
       [0.33333333, 0.66666667],
       [0.        , 1.        ],
       [0.        , 1.        ],
       [0.66666667, 0.33333333],
       [0.66666667, 0.33333333],
       [0.66666667, 0.33333333],
       [0.        , 1.        ],
       [0.66666667, 0.33333333],
       [0.        , 1.        ],
       [0.66666667, 0.33333333],
       [1.        , 0.        ],
       [0.        , 1.        ],
       [0.33333333, 0.66666667],
       [0.        , 1.        ],
       [0.66666667, 0.33333333],
       [0.33333333, 0.66666667],
       [0.66666667, 0.33333333],
       [0.        , 1.        ],
       [0.66666667, 0.33333333],
       [0.66666667, 0.33333333],
       [0.        , 1.        ],
       [0.        , 1.        ],
       [0.33333333, 0.66666667],
       [0.66666667, 0.33333333],
       [0.33333333, 0.66666667],
       [0.        , 1.        ],
       [0.66666667, 0.33333333],
       [0.33333333, 0.66666667],
       [0.66666667, 0.33333333],
       [0.

In [531]:
best_params = gs.best_params_
best_params

{'cvec__max_features': 1000,
 'cvec__min_df': 5,
 'cvec__ngram_range': (1, 1),
 'knn__n_neighbors': 3,
 'knn__p': 2}

### Second Model Score Notes:

This model is absolutely overfit, but it seems like it's getting closer to a predicting the value of our target `y` variable.
 
 **Although the bias has increased (evident in how much worse it has become in prediction, even on the train data), it does seem to be getting closer to score equality with testing on data the model has not seen**