# Gathering Data

In [172]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV

```python
import pandas as pd
import time
import requests
import datetime as dt
def get_date(created):
    # get the date of post
    return dt.date.fromtimestamp(created)
def query_pushshift(subreddit, kind='submission', skip=4, times=450, 
                    subfield = ['title', 'selftext', 'subreddit', 'created_utc', 'author', 'num_comments',
                                'score', 'is_self'],
                    comfields = ['body', 'score', 'created_utc']):
    # get the base url that contains information I want to scrape where 'kind' are all submitted posts
    # and 'subreddit' is the specified subreddit. Get 500 posts.
    stem = f"https://api.pushshift.io/reddit/search/{kind}/?subreddit={subreddit}&size=500"
    # instantiate list to contain 
    mylist = []
    # scrape posts from the subreddit 'times' times
    for x in range(1, times + 1):
        # Get posts 'skip' * 'x' days ago
        URL = f"{stem}&after={skip * x}d"
        print(URL)
        # Scrape URL
        response = requests.get(URL)
        # Give me an AssertionError if status code not 200
        assert response.status_code == 200
        # Of the HTML scraped, take the values of 'data'
        the_json=response.json()
        no_blanks=[c for c in the_json['data'] if ('selftext' in c.keys()) and len(c['selftext'])>10]
        # turn the data into a dataframe
        df = pd.DataFrame.from_dict(no_blanks)
        # append the dataframe to mylist
        mylist.append(df)
        # wait to not overrun Reddit's resources
        time.sleep(3)
    # concatenate the dataframes together as one large dataframe, full
    full = pd.concat(mylist, sort=False)
    if kind == "submission":
        # take all speficied data
        full = full[subfield]
        # drop duplicate rows
        full = full.drop_duplicates()
        full = full.loc[full['is_self'] == True]
    # date the the post was... posted
    _timestamp = full["created_utc"].apply(get_date)
    full['timestamp'] = _timestamp
    print(full.shape)
    return full
```

```python
df1 = query_pushshift('artificial')
df = query_pushshift('MachineLearning')
```

## Code above was used to web scrape reddit 

### Saving csv files

In [315]:
df1= pd.read_csv('./final1_csv')

In [314]:
df = pd.read_csv('./final_csv')

In [231]:
final = pd.concat([df,df1])

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  """Entry point for launching an IPython kernel.


### Concat

In [232]:
final = final[final['author'] != 'AutoModerator']
final.drop(columns =['Unnamed: 0','timestamp', 'score', 'is_self','num_comments'], inplace=True)
final.to_csv('./final_data_set.csv')

In [233]:
final['subreddit'] = final['subreddit'].map({'MachineLearning': 0, 'artificial': 1})

In [234]:
final.head()

Unnamed: 0,Unnamed: 0.1,Unnamed: 0.1.1,author,created_utc,selftext,subreddit,title
0,,,__czyzewski,1579903195,**Project/Source Code:** [https://github.com/m...,0,[R] [P] batchboost: regularization for stabili...
1,,,sculptor311,1579904755,Hallo\n\nIch suche Teilnehmer für meine Umfra...,0,[P] Umfrage bezüglich künstlicher Intelligenz ...
2,,,ExtremeSavings,1579909752,What is currently considered the state of the ...,0,[D] asynchronous stochastic gradient descent
3,,,whria78,1579910017,https://journals.plos.org/plosone/article?id=1...,0,[R] Pigeons classify breast cancer as well as ...
4,,,insanelylogical,1579915874,A lot of people I have talked to seem to think...,0,[D] Thoughts on why training a network from a ...


In [308]:
final

Unnamed: 0,Unnamed: 0.1,Unnamed: 0.1.1,author,created_utc,selftext,subreddit,title
0,,,__czyzewski,1579903195,**Project/Source Code:** [https://github.com/m...,0,[R] [P] batchboost: regularization for stabili...
1,,,sculptor311,1579904755,Hallo\n\nIch suche Teilnehmer für meine Umfra...,0,[P] Umfrage bezüglich künstlicher Intelligenz ...
2,,,ExtremeSavings,1579909752,What is currently considered the state of the ...,0,[D] asynchronous stochastic gradient descent
3,,,whria78,1579910017,https://journals.plos.org/plosone/article?id=1...,0,[R] Pigeons classify breast cancer as well as ...
4,,,insanelylogical,1579915874,A lot of people I have talked to seem to think...,0,[D] Thoughts on why training a network from a ...
...,...,...,...,...,...,...,...
5230,5230.0,1.0,hampthecoolest,1424881406,I recently have been replaying mega man battle...,1,"How many decades until ""Net Navi's"" could be a..."
5231,5231.0,2.0,Thistleknot,1424909230,Anyone know any good ones? Maybe with [pseudo...,1,Recurrent neural network tutorials?
5232,5232.0,3.0,kazein,1424914198,"-So, I did a quick search and couldn't find an...",1,Scripting computer opponent AI for Star Wars:G...
5233,5233.0,4.0,Thistleknot,1425006236,Trying a simple feed forward ANN.\n\nI know to...,1,Question on threshold's and weights for an ANN.


In [235]:
X = final['selftext']
y = final['subreddit']

In [236]:
X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size=0.33,
                                                    stratify=y,
                                                    random_state=42)

In [237]:
from bs4 import BeautifulSoup
import regex as re
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.stem.porter import PorterStemmer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

In [238]:
lemmatizer = WordNetLemmatizer()
p_stemmer = PorterStemmer()

In [239]:
def to_words(raw_review):
    # Function to convert a raw review to a string of words
    # The input is a single string (a raw movie review), and 
    # the output is a single string (a preprocessed movie review)
    
    # 1. Remove HTML.
    review_text = BeautifulSoup(raw_review).get_text()
    
    # 2. Remove non-letters.
    letters_only = re.sub("[^a-zA-Z]", " ", review_text)
    
    # 3. Convert to lower case, split into individual words.
    words = letters_only.lower().split()
    
    # 4. In Python, searching a set is much faster than searching
    # a list, so convert the stop words to a set.
    stops = set(stopwords.words('english'))
    
    # 5. Remove stop words.
    meaningful_words = [w for w in words if w not in stops]
    lemma_words = [lemmatizer.lemmatize(a) for a in meaningful_words]
    stem_words = [p_stemmer.stem(i) for i in lemma_words]
    # 6. Join the words back into one string separated by space, 
    # and return the result.
    return(" ".join(stem_words))

In [240]:
def conver_str (str_self):
    new_machine_self = []
    for machine in str_self:
        # Convert review to words, then append to clean_train_reviews.
        new_machine_self.append(to_words(machine))
    return new_machine_self

In [241]:
X_train = conver_str(X_train)
X_test = conver_str(X_test)

  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that d

  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup

https://itunes.apple.com/us/app/summize/id1087195099?mt=8" looks like a URL. Beautiful Soup is not an HTTP client. You should probably use an HTTP client like requests to get the document behind the URL, and feed that document to

  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
https://facebook.com/pg/icml.imls/videos/" looks like a URL. Beautiful Soup is not an HTTP client. You should probably use an HTTP client like requests to get the document behind the URL, and feed that document to Beautiful Soup.
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup


  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup
  ' that document to Beautiful Soup.' % decoded_markup


In [312]:
vect = TfidfVectorizer(max_features=20)
tfidf_matrix = vect.fit_transform(X_train)
dfr = pd.DataFrame(tfidf_matrix.toarray(), columns = vect.get_feature_names())
dfr

Unnamed: 0,ai,com,data,github,http,human,intellig,know,learn,like,machin,model,one,think,time,train,use,work,would,www
0,0.340121,0.342212,0.000000,0.000000,0.581524,0.000000,0.494942,0.000000,0.000000,0.000000,0.000000,0.000000,0.429035,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000
1,0.000000,0.000000,0.226506,0.000000,0.000000,0.000000,0.000000,0.000000,0.180845,0.000000,0.230638,0.455168,0.221728,0.000000,0.0,0.726740,0.000000,0.207505,0.187738,0.000000
2,0.000000,0.000000,0.000000,0.000000,0.450791,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.726720,0.518330,0.000000,0.000000,0.000000
3,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.637934,0.000000,0.547324,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.541735,0.000000
4,0.000000,0.269928,0.000000,0.000000,0.458690,0.000000,0.000000,0.000000,0.552026,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.527413,0.000000,0.000000,0.365849
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7149,0.000000,0.239817,0.000000,0.362325,0.203761,0.000000,0.000000,0.149888,0.000000,0.000000,0.000000,0.617202,0.000000,0.000000,0.0,0.492726,0.351435,0.000000,0.000000,0.000000
7150,0.000000,0.470707,0.000000,0.000000,0.399938,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.459858,0.000000,0.000000,0.637977
7151,0.000000,0.499716,0.000000,0.754991,0.424585,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000
7152,0.000000,0.000000,0.000000,0.000000,0.539751,0.000000,0.000000,0.397043,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.310309,0.000000,0.674340,0.000000


In [313]:
coun = CountVectorizer(max_features=20)
coun_matrix = coun.fit_transform(X_train)
dfc = pd.DataFrame(coun_matrix.toarray(), columns = coun.get_feature_names())
dfc

Unnamed: 0,ai,com,data,github,http,human,intellig,know,learn,like,machin,model,one,think,time,train,use,work,would,www
0,1,1,0,0,2,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0
1,0,0,1,0,0,0,0,0,1,0,1,2,1,0,0,3,0,1,1,0
2,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0
3,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,1,0
4,0,1,0,0,2,0,0,0,2,0,0,0,0,0,0,0,2,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7149,0,2,0,2,2,0,0,1,0,0,0,4,0,0,0,3,3,0,0,0
7150,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1
7151,0,2,0,2,2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
7152,0,0,0,0,2,0,0,1,0,0,0,0,0,0,0,0,1,0,2,0


## Modeling 

In [243]:
pipe = Pipeline([
    ('cvec', CountVectorizer()),
    ('lr', LogisticRegression(max_iter = 500))
])
pipe_params = {
    'cvec__max_features': [2500, 3000, 3500],
    'cvec__min_df': [2, 3],
    'cvec__max_df': [.9, .95],
    'cvec__ngram_range': [(1,1), (1,2)]
}
gs = GridSearchCV(pipe, 
                  param_grid=pipe_params, 
                  cv=5)

In [244]:
gs.fit(X_train, y_train);

In [245]:
y_test.value_counts(normalize=True)

0    0.509648
1    0.490352
Name: subreddit, dtype: float64

In [262]:
print(f"Train score: {gs.score(X_train, y_train)}")
print(f"Test score: {gs.score(X_test, y_test)}")

Train score: 0.961839530332681
Test score: 0.8090238365493757


In [247]:
best_cvec_logi = gs.best_estimator_

In [248]:
pipe = Pipeline([
    ('tvec', TfidfVectorizer()),
    ('lr', LogisticRegression(max_iter = 500))
])
pipe_params = {
    'tvec__max_features': [2500, 3000, 3500],
    'tvec__min_df': [2, 3],
    'tvec__max_df': [.9, .95],
    'tvec__ngram_range': [(1,1), (1,2)]
}
gs_tvec = GridSearchCV(pipe, 
                  param_grid=pipe_params, 
                  cv=5)

In [249]:
gs_tvec.fit(X_train, y_train)

GridSearchCV(cv=5, error_score=nan,
             estimator=Pipeline(memory=None,
                                steps=[('tvec',
                                        TfidfVectorizer(analyzer='word',
                                                        binary=False,
                                                        decode_error='strict',
                                                        dtype=<class 'numpy.float64'>,
                                                        encoding='utf-8',
                                                        input='content',
                                                        lowercase=True,
                                                        max_df=1.0,
                                                        max_features=None,
                                                        min_df=1,
                                                        ngram_range=(1, 1),
                                                        no

In [251]:
print(gs_tvec.best_score_)

0.8342203847864227


In [261]:
print(f"Train score: {gs_tvec.score(X_train, y_train)}")
print(f"Test score: {gs_tvec.score(X_test, y_test)}")

Train score: 0.8860777187587364
Test score: 0.8433598183881952


In [318]:
from sklearn.naive_bayes import MultinomialNB, GaussianNB, BernoulliNB

In [316]:
pipe_ve = Pipeline([
                 ('tvec', TfidfVectorizer()),
                 ('mul', MultinomialNB())
])

pipe_params_mul_ve = {
    'tvec__max_features': [2500, 3000, 3500],
    'tvec__min_df': [2, 3],
    'tvec__max_df': [.9, .95],
    'tvec__ngram_range': [(1,1), (1,2)]
}
mul_ve = GridSearchCV(pipe_ve, 
                  param_grid=pipe_params_mul_ve, 
                  cv=5)

In [317]:
mul_ve.fit(X_train,y_train);
print(f"Train score: {mul_ve.score(X_train, y_train)}")
print(f"Test score: {mul_ve.score(X_test, y_test)}")

Train score: 0.8419066256639642
Test score: 0.8095913734392736


In [324]:
pipe_ber_t = Pipeline([
                 ('tvec', TfidfVectorizer()),
                 ('ber', BernoulliNB())
])

pipe_params_mul_ber_t = {
    'tvec__max_features': [2500, 3000, 3500],
    'tvec__min_df': [2, 3],
    'tvec__max_df': [.9, .95],
    'tvec__ngram_range': [(1,1), (1,2)]
}
ber_tvec = GridSearchCV(pipe_ber_t, 
                  param_grid=pipe_params_mul_ber_t, 
                  cv=5)
ber_tvec.fit(X_train, y_train)
print(ber_tvec.score(X_train, y_train))
print(ber_tvec.score(X_test, y_test))

0.8214984623986581
0.8030646992054483


In [322]:
pipe_ber = Pipeline([
                 ('cvec', CountVectorizer()),
                 ('ber', BernoulliNB())
])

pipe_params_mul_ber = {
    'cvec__max_features': [2500, 3000, 3500],
    'cvec__min_df': [2, 3],
    'cvec__max_df': [.9, .95],
    'cvec__ngram_range': [(1,1), (1,2)]
}
ber_cvec = GridSearchCV(pipe_ber, 
                  param_grid=pipe_params_mul_ber, 
                  cv=5)
ber_cvec.fit(X_train, y_train)
print(ber_cvec.score(X_train, y_train))
print(ber_cvec.score(X_test, y_test))

0.8214984623986581
0.8030646992054483


In [255]:
pipe_c = Pipeline([
                 ('cvec', CountVectorizer()),
                 ('mul', MultinomialNB())
])

pipe_params_mul_cvec = {
    'cvec__max_features': [2500, 3000, 3500],
    'cvec__min_df': [2, 3],
    'cvec__max_df': [.9, .95],
    'cvec__ngram_range': [(1,1), (1,2)]
}
mul_cvec = GridSearchCV(pipe_c, 
                  param_grid=pipe_params_mul_cvec, 
                  cv=5)

In [256]:
mul_cvec.fit(X_train,y_train);

GridSearchCV(cv=5, error_score=nan,
             estimator=Pipeline(memory=None,
                                steps=[('cvec',
                                        CountVectorizer(analyzer='word',
                                                        binary=False,
                                                        decode_error='strict',
                                                        dtype=<class 'numpy.int64'>,
                                                        encoding='utf-8',
                                                        input='content',
                                                        lowercase=True,
                                                        max_df=1.0,
                                                        max_features=None,
                                                        min_df=1,
                                                        ngram_range=(1, 1),
                                                        prep

In [257]:
mul_cvec.best_estimator_

Pipeline(memory=None,
         steps=[('cvec',
                 CountVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.int64'>, encoding='utf-8',
                                 input='content', lowercase=True, max_df=0.9,
                                 max_features=3500, min_df=3,
                                 ngram_range=(1, 1), preprocessor=None,
                                 stop_words=None, strip_accents=None,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=None, vocabulary=None)),
                ('mul',
                 MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True))],
         verbose=False)

In [264]:
print(f"Train score: {mul_cvec.score(X_train, y_train)}")
print(f"Test score: {mul_cvec.score(X_test, y_test)}")

Train score: 0.8247134470226447
Test score: 0.804483541430193


In [274]:
from sklearn.preprocessing import FunctionTransformer

In [297]:
tf_ = TfidfVectorizer(max_features=1800, max_df=.95, ngram_range=(1,2), min_df=2)
tf_.fit(X_train, y_train)
X_train_tf = tf_.transform(X_train).todense()
X_test_tf = tf_.transform(X_test).todense()

gnb = GaussianNB()

gnb.fit(X_train_tf, y_train)
print(gnb.score(X_train_tf, y_train))
print(gnb.score(X_test_tf, y_test))

0.8160469667318982
0.7829171396140749


In [298]:
from sklearn.tree import DecisionTreeClassifier

In [303]:
pipe_tree = Pipeline([
                 ('cvec', CountVectorizer()),
                 ('dt', DecisionTreeClassifier(random_state = 42))
])
pipe_params_tree = {
    'cvec__max_features': [2500, 3000, 3500],
    'cvec__min_df': [2, 3],
    'cvec__max_df': [.9, .95],
    'cvec__ngram_range': [(1,1), (1,2)],
    'dt__max_depth': [10],
    'dt__min_samples_split': [5,7],
    'dt__min_samples_leaf': [2,3,4]
}
tree_cvec = GridSearchCV(pipe_tree, 
                  param_grid=pipe_params_tree, 
                  cv=5)
tree_cvec.fit(X_train, y_train)
print(tree_cvec.score(X_train, y_train))
print(tree_cvec.score(X_test, y_test))

0.8331003634330445
0.7735527809307605


In [306]:
pipe_tree_tvec = Pipeline([
                 ('tvec', TfidfVectorizer()),
                 ('dt', DecisionTreeClassifier(random_state = 42))
])
pipe_params_tree_tvec = {
    'tvec__max_features': [3000, 3500],
    'tvec__min_df': [2, 3],
    'tvec__max_df': [.9, .95],
    'tvec__ngram_range': [(1,1), (1,2)],
    'dt__max_depth': [10,15,20],
    'dt__min_samples_split': [3,5],
    'dt__min_samples_leaf': [2,3,4]
}
tree_tvec = GridSearchCV(pipe_tree_tvec, 
                  param_grid=pipe_params_tree_tvec, 
                  cv=5)
tree_tvec.fit(X_train, y_train)
print(tree_tvec.score(X_train, y_train))
print(tree_tvec.score(X_test, y_test))

0.8381325132792843
0.771566401816118


In [305]:
tree_tvec.best_params_

{'dt__max_depth': 10,
 'dt__min_samples_leaf': 2,
 'dt__min_samples_split': 5,
 'tvec__max_df': 0.9,
 'tvec__max_features': 3000,
 'tvec__min_df': 2,
 'tvec__ngram_range': (1, 1)}

In [307]:
tree_tvec.best_params_

{'dt__max_depth': 10,
 'dt__min_samples_leaf': 2,
 'dt__min_samples_split': 3,
 'tvec__max_df': 0.9,
 'tvec__max_features': 3000,
 'tvec__min_df': 2,
 'tvec__ngram_range': (1, 1)}

In [325]:
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from sklearn.model_selection import cross_val_score, train_test_split, GridSearchCV

In [326]:
pipe_tree_tvec_r = Pipeline([
                 ('tvec', TfidfVectorizer()),
                 ('rf', RandomForestClassifier(random_state = 42))
])
rf_params = {
    'tvec__max_features': [3000, 3500],
    'tvec__min_df': [2, 3],
    'tvec__max_df': [.9, .95],
    'tvec__ngram_range': [(1,1), (1,2)],
    'rf__n_estimators': [100, 150, 200],
    'rf__max_depth': [None, 1, 2, 3, 4, 5],
}
gs_dr = GridSearchCV(pipe_tree_tvec_r, param_grid=rf_params, cv=5)
gs_dr.fit(X_train, y_train)
print(gs_dr.score(X_train, y_train))
print(gs_dr.score(X_test, y_test))

0.9967850153760134
0.837116912599319
