In [None]:
import requests
import pandas as pd
from bs4 import BeautifulSoup
import time
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
import patsy
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from scipy.optimize import minimize
from scipy.stats import t
import statsmodels.api as sm    #import statsmodels for the stats models
import statsmodels.formula.api as smf
from sklearn.linear_model import RidgeCV
from sklearn.linear_model import ElasticNet
from sklearn.model_selection import KFold
%matplotlib inline

In [None]:
# Download the Wikileaf strain index page and parse it for later link extraction.
url = 'https://www.wikileaf.com/strains/'
# The timeout keeps this cell from hanging indefinitely on a stalled connection;
# raise_for_status() stops an HTTP error page from being parsed as the index.
response = requests.get(url, timeout=30)
response.raise_for_status()
wl = response.text
wl_soup = BeautifulSoup(wl,'lxml')

In [None]:
# Every element on the index page that carries the 'strain-title' class.
strains = wl_soup.find_all(attrs={'class': 'strain-title'})

In [None]:
# Accumulators for the anchor text and href of each strain title.
Strain_name, Strain_link = [], []

In [None]:
# Take the first <a> inside each title element: its text goes to Strain_name
# and its href goes to Strain_link.
for title_div in strains:
    anchor = title_div.find('a')
    Strain_name.append(anchor.text)
    Strain_link.append(anchor['href'])

In [None]:
# Accumulator lists filled by the per-strain scraping loop.
genetics_ls, ind_sat, THC_content = [], [], []
parents_1, parents_2, reviews = [], [], []

In [None]:
#A helper method for pretty-printing linear models
def pretty_print_linear(coefs, names=None, sort=False):
    """Render a linear model as a string such as ``"0.5 * X0 + 2.0 * X1"``.

    Parameters
    ----------
    coefs : sequence of numbers
        Model coefficients; each is rounded to 3 decimals in the output.
    names : sequence of str, optional
        One label per coefficient. When omitted, labels ``X0, X1, ...`` are
        generated. May be a list, NumPy array or pandas Index.
    sort : bool, default False
        When true, terms are ordered by decreasing absolute coefficient.

    Returns
    -------
    str
        The terms ``"<coef> * <name>"`` joined by ``" + "``.
    """
    # Identity check on purpose: `names == None` is evaluated element-wise for
    # NumPy arrays / pandas Index objects and then fails in the `if` with
    # "truth value of an array is ambiguous".
    if names is None:
        names = ["X%s" % i for i in range(len(coefs))]
    terms = zip(coefs, names)
    if sort:
        terms = sorted(terms, key=lambda term: -np.abs(term[0]))
    return " + ".join("%s * %s" % (round(coef, 3), name)
                      for coef, name in terms)

def indica_pct(genetics, ind_sat):
    """Return the indica fraction for a strain.

    `genetics` is the fraction attached to the label `ind_sat`. For 'Indica'
    and 'Hybrid' the fraction is returned unchanged; for 'Sativa' its
    complement (1 - genetics) is returned. Any other label yields None.
    """
    if ind_sat in ('Indica', 'Hybrid'):
        return genetics
    if ind_sat == 'Sativa':
        return 1 - genetics
    return None
    
def sativa_pct(genetics, ind_sat):
    """Return the sativa fraction for a strain.

    `genetics` is the fraction attached to the label `ind_sat`. For 'Sativa'
    and 'Hybrid' the fraction is returned unchanged; for 'Indica' its
    complement (1 - genetics) is returned. Any other label yields None.
    """
    if ind_sat in ('Sativa', 'Hybrid'):
        return genetics
    if ind_sat == 'Indica':
        return 1 - genetics
    return None

In [None]:
# Scrape every strain page and append exactly ONE entry per strain to each
# accumulator list, so the lists stay the same length as Strain_name when they
# are combined into a DataFrame. A step that fails records a placeholder
# (None for genetics / THC / reviews, 'NA' for parents) instead of nothing.
# Missing THC values are therefore None, not strings.
REQUEST_TIMEOUT = 30  # seconds to wait for one strain page before giving up

for link in Strain_link:

    print('Souping %s ...' % link)

    # Reset per-strain state: without this, a failed download or a missing
    # genealogy box would silently reuse the previous strain's page / parents.
    strain_soup = None
    parents = []

    try:
        response = requests.get(link, timeout=REQUEST_TIMEOUT)
        response.raise_for_status()
        strain_soup = BeautifulSoup(response.text, 'lxml')
        print('Success!')

    except Exception as e:
        print('Error souping %s' % link)
        print(e)

    # Genetics: the text is split on '%'; the part before it is the percentage
    # and the part after it is the label. The label is stripped of whitespace so
    # it can match the exact comparisons in indica_pct / sativa_pct.
    genetics_value, genetics_label = None, None
    try:
        print('Getting genetics data for %s ' % link)
        genetics = strain_soup.find(class_='strain-type-text')
        genetics = genetics.text.strip().split('%')
        genetics_value = float(genetics[0]) / 100
        genetics_label = genetics[1].strip()
        print('Success!')

    except Exception as e:
        # Record both fields as missing so value and label never get out of step.
        genetics_value, genetics_label = None, None
        print('Unable to gather genetics data for %s ' % link)
        print(e)

    genetics_ls.append(genetics_value)
    ind_sat.append(genetics_label)

    # THC: text of the third 'graph-val' element.
    # NOTE(review): presumably the average reading — confirm against the page layout.
    avg_thc = None
    try:
        print('Getting THC data for %s ' % link )
        THC_tests = strain_soup.find_all(class_='graph-val')
        avg_thc = THC_tests[2].text
        print('Success!')

    except Exception as e:
        print('Unable to gather THC data for %s ' % link)
        print(e)

    THC_content.append(avg_thc)

    # Parents: <li> items inside the 'strain-side-box parent' element.
    parent_1 = 'NA'
    try:
        print('Getting parent 1 for %s ' % link)
        parents = strain_soup.find(class_='strain-side-box parent').find_all('li')
        parent_1 = parents[0].text
        print('Success!')

    except Exception as e:
        print('No genealogy data for %s' % link)
        print(e)

    parents_1.append(parent_1)

    parent_2 = 'NA'
    try:
        print('Getting parent 2 for %s ' % link)
        # `parents` is empty when the genealogy lookup above failed, so this
        # falls through to 'NA' rather than picking up a stale value.
        parent_2 = parents[1].text
        print('Success!')

    except Exception as e:
        print('%s might be 1-parent strain' % link)
        print(e)

    parents_2.append(parent_2)

    # Reviews: first whitespace-separated token of the 'review-counts' text;
    # a leading 'Write' is recorded as zero reviews.
    review_count = None
    try:
        print('Getting review counts for %s ' % link)
        review_count = strain_soup.find(class_='review-counts').text
        review_count = review_count.strip().split(' ')[0]
        if review_count == 'Write':
            review_count = 0
        print('Success!')

    except Exception as e:
        review_count = None
        print('Review problem with %s' % link)
        print(e)

    reviews.append(review_count)

    # Pause between requests.
    time.sleep(1)

In [None]:
# Combine the scraped lists into one table, one column per list, in a fixed order.
column_order = ['strain', 'genetics', 'ind_sat', 'thc_content',
                'parent_1', 'parent_2', 'no_reviews']
column_values = [Strain_name, genetics_ls, ind_sat, THC_content,
                 parents_1, parents_2, reviews]
genetics_df = pd.DataFrame(dict(zip(column_order, column_values)), columns=column_order)

In [None]:
# Convert the scraped THC text (trailing '%' removed) to numbers.
# astype(str) lets the cleanup run on missing entries and on an already-numeric
# column, so the cell can be re-run; errors='coerce' turns anything that is not
# a number (including missing values) into NaN instead of raising.
thc_text = genetics_df['thc_content'].astype(str).str.rstrip('%')
genetics_df['thc_content'] = pd.to_numeric(thc_text, errors='coerce')
genetics_df['thc_content'].hist()

In [None]:
# Add 'indica' and 'sativa' columns by applying the matching helper to each
# row's ('genetics', 'ind_sat') pair.
for share_col, share_fn in (('indica', indica_pct), ('sativa', sativa_pct)):
    genetics_df[share_col] = genetics_df.apply(
        lambda row, fn=share_fn: fn(row['genetics'], row['ind_sat']),
        axis=1,
    )

In [None]:
# THC content against indica fraction (red dots) and sativa fraction (blue dots).
x = genetics_df['indica']
y = genetics_df['thc_content']
z = genetics_df['sativa']
w = genetics_df['thc_content']
plt.plot(x, y, 'ro')
plt.plot(z, w, 'bo')

In [None]:
# Keep only strains for which both parent columns hold something other than 'NA'.
mask = genetics_df['parent_1'].ne('NA') & genetics_df['parent_2'].ne('NA')
twoparents_df = genetics_df[mask]

In [None]:
# Encode each parent column with patsy, then add the two design matrices so a
# column holds the number of times that term appears across parent_1 and parent_2.
# NOTE(review): the formula 'parent' also yields an intercept column and, with
# patsy's default coding, omits one level per matrix — confirm this is intended.
P0_1a = patsy.dmatrix('parent',
                    data = twoparents_df.rename(columns={'parent_1':'parent'}),
                    return_type = 'dataframe').dropna(axis=0)

P0_2a = patsy.dmatrix('parent',
                    data = twoparents_df.rename(columns={'parent_2':'parent'}),
                    return_type = 'dataframe').dropna(axis=0)

# sorted() gives a stable column order; iterating a bare set() of strings can
# differ between kernel sessions, which would reshuffle the feature matrix.
columns = sorted(set(P0_1a.columns) | set(P0_2a.columns))

# Give both matrices the same columns (absent terms filled with 0) and add them.
# The sum keeps the row index of the design matrices instead of a fresh 0..n-1
# index, so it lines up with twoparents_df when the two are joined.
P0a = (P0_1a.reindex(columns=columns, fill_value=0)
       + P0_2a.reindex(columns=columns, fill_value=0))

In [None]:
# Join the strain rows with their parent encoding and drop the raw parent columns.
# Both frames are put on a plain 0..n-1 index first: P0a holds one row per row
# of twoparents_df in the same order, but twoparents_df keeps the labels of the
# unfiltered frame, so an index-aligned concat could pair up the wrong rows.
encoded = pd.concat(
    [twoparents_df.reset_index(drop=True), P0a.reset_index(drop=True)],
    axis=1,
)
# drop() returns the new frame; with inplace=True it returns None, which would
# leave `encoded` set to None.
encoded = encoded.drop(['parent_1', 'parent_2'], axis=1)

In [None]:
# Fit an ordinary least-squares baseline and show its R^2 on the same data.
lr = LinearRegression()
# Features: every column from position 5 onward.
# NOTE(review): with the column order built earlier in this notebook that is
# 'indica', 'sativa' and the parent encoding columns — confirm.
X = encoded.iloc[:, 5:]
# Target selected by name: positional column 2 is 'ind_sat' (a text label),
# not the numeric THC content this analysis models.
y = encoded['thc_content']
lr.fit(X,y)
lr.score(X,y)

In [None]:
# Hold out 30% of the rows for testing. The fixed random_state makes the split
# (and the shuffled folds below) identical on every Restart & Run All.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

kf = KFold(n_splits=5, shuffle=True, random_state=42)
# Degree-2 polynomial expansion of the features followed by least squares.
degree = 2
est = make_pipeline(PolynomialFeatures(degree), LinearRegression())

In [None]:
# Fit `est` (the polynomial-regression pipeline built in the previous cell) on all of X and y.
est.fit(X,y)

In [None]:
# Grid-search ElasticNet (on degree-2 polynomial features) with 5-fold
# cross-validation and keep the (alpha, l1_ratio) pair with the lowest MEAN
# test MSE across folds.
kf = KFold(n_splits=5, shuffle=True, random_state=42)  # seeded so folds are reproducible

degree=2
alphas=[1e-5,1e-3,1e-1,0,1,100]
l1_ratios = [0.1,0.9,0.5]

# Every (alpha, l1_ratio) combination, alpha varying slowest.
params = [(a, l1r) for a in alphas for l1r in l1_ratios]

# all_scores[f][p] = test MSE of parameter set p on fold f.
all_scores=[]
for train, test in kf.split(X):
    x_tr = X.iloc[train]
    y_tr = y.iloc[train]
    x_te = X.iloc[test]
    y_te = y.iloc[test]
    mse_score=[]
    for a,l1r in params:
        est=make_pipeline(PolynomialFeatures(degree), ElasticNet(alpha=a, l1_ratio=l1r))
        est.fit(x_tr,y_tr)
        mse_score.append(mean_squared_error(y_te, est.predict(x_te)))
    all_scores.append(mse_score)

# Average over folds before choosing: taking the minimum over individual
# (fold, params) cells would reward whichever setting got one lucky fold.
mean_scores = np.mean(all_scores, axis=0)
best_idx = int(np.argmin(mean_scores))
best_score = mean_scores[best_idx]
best_params = params[best_idx]
print ('best_idx = ', best_idx, ' best_params = ', best_params, '  best_score = ', best_score)
# The final estimator is a plain ElasticNet, so its coef_ has one entry per
# column of X (the coefficient listing further down relies on that).
# NOTE(review): the grid above was scored WITH polynomial features — confirm
# that dropping them for the final model is intended.
est=ElasticNet(alpha=best_params[0], l1_ratio=best_params[1])

In [None]:
# Plot predicted against actual THC content for the held-out rows.
# The estimator is fitted on the training split first: at this point `est` has
# just been re-created and is still unfitted. The split variables are named
# X_test / y_test (there is no `xtest` / `ytest` in this notebook).
est.fit(X_train, y_train)
plt.plot(y_test, est.predict(X_test), color='blue',
         marker='o',ls='')

# Hide the tick marks on both axes.
plt.xticks(())
plt.yticks(())

plt.show()

In [None]:
# Fit on the training split, then list the ten features whose coefficients
# have the largest absolute value.
est.fit(X_train, y_train)
zipped = zip(X.columns, est.coef_)
features = sorted(zipped, key=lambda pair: abs(pair[1]), reverse=True)
for name, coef in features[:10]:
    print ("{:20.20s} {:7.3f}".format(name, coef))

In [None]:
# Scatter THC content against the 'parent_Pineapple_Kush' column (red dots).
# NOTE(review): no visible cell adds a 'parent_Pineapple_Kush' column to
# genetics_df — confirm where it comes from.
x = genetics_df['parent_Pineapple_Kush']
y = genetics_df['thc_content']
plt.plot(x, y, 'ro')

In [None]:
# Scatter THC content against the 'parent_Fucking_Incredible' column (red dots).
# NOTE(review): no visible cell adds a 'parent_Fucking_Incredible' column to
# genetics_df — confirm where it comes from.
x = genetics_df['parent_Fucking_Incredible']
y = genetics_df['thc_content']
plt.plot(x, y, 'ro')

In [None]:
# Scatter THC content against the 'parent_Chemdawg' column (red dots).
# NOTE(review): no visible cell adds a 'parent_Chemdawg' column to
# genetics_df — confirm where it comes from.
x = genetics_df['parent_Chemdawg']
y = genetics_df['thc_content']
plt.plot(x, y, 'ro')

In [None]:
# Scatter THC content against the 'parent_Golden_Goat' column (green dots).
# NOTE(review): no visible cell adds a 'parent_Golden_Goat' column to
# genetics_df — confirm where it comes from.
x = genetics_df['parent_Golden_Goat']
y = genetics_df['thc_content']
plt.plot(x, y, 'go')

In [None]:
# Scatter THC content against the 'parent_Tahoe_OG' column (red dots).
# NOTE(review): no visible cell adds a 'parent_Tahoe_OG' column to
# genetics_df — confirm where it comes from.
x = genetics_df['parent_Tahoe_OG']
y = genetics_df['thc_content']
plt.plot(x, y, 'ro')