In [1]:
import json

import numpy as np
import pandas as pd

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import cross_val_score #Evaluate score by cross validation

In [2]:
import sklearn.linear_model as lm

In [3]:
# NumPy version in use for this run.
np.__version__

'1.16.2'

In [None]:
# Download the competition data with the Kaggle CLI.
# Uncomment the command below to run it.
#!kaggle competitions download -c stumbleupon

In [4]:
# Directory that holds train.tsv and test.tsv.
PATH = 'data'

In [5]:
# Read the labelled training split; the file is tab-separated.
train_df = pd.read_csv(
    f'{PATH}/train.tsv',
    sep='\t',
)

In [6]:
# Read the test split (same tab-separated layout as the training file).
test_df = pd.read_csv(
    f'{PATH}/test.tsv',
    sep='\t',
)

In [7]:
# Number of rows in each split.
train_df.shape[0], test_df.shape[0]

(7395, 3171)

In [8]:
# Column names of the training frame.
train_df.columns

Index(['url', 'urlid', 'boilerplate', 'alchemy_category',
       'alchemy_category_score', 'avglinksize', 'commonlinkratio_1',
       'commonlinkratio_2', 'commonlinkratio_3', 'commonlinkratio_4',
       'compression_ratio', 'embed_ratio', 'framebased', 'frameTagRatio',
       'hasDomainLink', 'html_ratio', 'image_ratio', 'is_news',
       'lengthyLinkDomain', 'linkwordscore', 'news_front_page',
       'non_markup_alphanum_characters', 'numberOfLinks', 'numwords_in_url',
       'parametrizedLinkRatio', 'spelling_errors_ratio', 'label'],
      dtype='object')

In [9]:
# Column names of the test frame.
test_df.columns

Index(['url', 'urlid', 'boilerplate', 'alchemy_category',
       'alchemy_category_score', 'avglinksize', 'commonlinkratio_1',
       'commonlinkratio_2', 'commonlinkratio_3', 'commonlinkratio_4',
       'compression_ratio', 'embed_ratio', 'framebased', 'frameTagRatio',
       'hasDomainLink', 'html_ratio', 'image_ratio', 'is_news',
       'lengthyLinkDomain', 'linkwordscore', 'news_front_page',
       'non_markup_alphanum_characters', 'numberOfLinks', 'numwords_in_url',
       'parametrizedLinkRatio', 'spelling_errors_ratio'],
      dtype='object')

In [10]:
# First boilerplate entry, kept as a one-row Series.
x = train_df['boilerplate'].head(1)

In [11]:
# The boilerplate value is a JSON string; decode the first row to inspect its contents.
# (json is imported with the other imports at the top of the notebook.)
data = json.loads(x.values[0])
data['title']

'IBM Sees Holographic Calls Air Breathing Batteries ibm sees holographic calls, air-breathing batteries'

In [12]:
# Inspect the 'url' entry of the decoded boilerplate.
data['url']

'bloomberg news 2010 12 23 ibm predicts holographic calls air breathing batteries by 2015 html'

In [13]:
# Raw boilerplate text is the model input; the label column is the target.
x_train = train_df['boilerplate'].values
y = list(train_df['label'].values)

In [14]:
# Boilerplate text of the rows to be scored.
x_test = test_df['boilerplate'].values

In [15]:
# Word-level TF-IDF over unigrams and bigrams.
# The on/off switches are passed as real booleans: recent scikit-learn releases
# validate constructor arguments and reject integers such as 1 for bool parameters.
tfidf_vectorizer = TfidfVectorizer(min_df=3,  # ignore terms that occur in fewer than 3 documents
                                   max_features=None,
                                   strip_accents='unicode',
                                   analyzer='word',
                                   token_pattern=r'\w{1,}',
                                   ngram_range=(1, 2), # an ngram_range of (1, 2) means unigrams and bigrams
                                   use_idf=True,
                                   smooth_idf=True,
                                   sublinear_tf=True) # Apply sublinear tf scaling, i.e. replace tf with 1 + log(tf)

In [16]:
# Stack training documents first, then test documents, into one list.
X_all = [*x_train, *x_test]

In [17]:
# Learn the vocabulary and IDF weights from train and test text together (X_all holds both).
tfidf_vectorizer.fit(X_all)

TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.float64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=3,
        ngram_range=(1, 2), norm='l2', preprocessor=None, smooth_idf=1,
        stop_words=None, strip_accents='unicode', sublinear_tf=1,
        token_pattern='\\w{1,}', tokenizer=None, use_idf=1,
        vocabulary=None)

In [18]:
# Vectorise the combined corpus in one pass; it is split back into train/test rows afterwards.
X_all_vec = tfidf_vectorizer.transform(X_all)

In [19]:
# The leading rows of the stacked matrix are the training documents.
X = X_all_vec[:train_df.shape[0]]

In [20]:
# Everything after the training rows belongs to the test documents.
X_test = X_all_vec[train_df.shape[0]:]

In [25]:
# L2-regularised logistic regression solved in its dual form.
# dual=True is only implemented by the liblinear solver, so the solver is named
# explicitly instead of relying on the library default, which is not liblinear
# in every scikit-learn release (the cell would otherwise raise on newer ones).
rd = lm.LogisticRegression(penalty='l2', dual=True, tol=0.0001, C=1,
                           fit_intercept=True, intercept_scaling=1.0,
                           class_weight=None, random_state=42,
                           solver='liblinear'
                          )

In [29]:
# 20-fold cross-validated ROC AUC on the training matrix; report the mean over folds.
fold_scores = cross_val_score(rd, X, y, cv=20, scoring='roc_auc')
print(f"20 Fold CV Score: {fold_scores.mean()}")



20 Fold CV Score: 0.8771238382225308


In [30]:
# Refit on every labelled row before scoring the test set.
print('training on full data')
rd.fit(X, y)

training on full data


LogisticRegression(C=1, class_weight=None, dual=True, fit_intercept=True,
          intercept_scaling=1.0, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False)

In [32]:
# Per-class probabilities for the test rows (one column per class).
rd.predict_proba(X_test)

array([[0.13883603, 0.86116397],
       [0.79080926, 0.20919074],
       [0.632865  , 0.367135  ],
       ...,
       [0.22973301, 0.77026699],
       [0.27508717, 0.72491283],
       [0.17519601, 0.82480399]])

In [35]:
# Keep only the second probability column (the score for rd.classes_[1]).
pred = rd.predict_proba(X_test)[:, 1]
pred

array([0.86116397, 0.20919074, 0.367135  , ..., 0.77026699, 0.72491283,
       0.82480399])

In [33]:
# Preview the first test rows.
test_df.head()

Unnamed: 0,url,urlid,boilerplate,alchemy_category,alchemy_category_score,avglinksize,commonlinkratio_1,commonlinkratio_2,commonlinkratio_3,commonlinkratio_4,...,image_ratio,is_news,lengthyLinkDomain,linkwordscore,news_front_page,non_markup_alphanum_characters,numberOfLinks,numwords_in_url,parametrizedLinkRatio,spelling_errors_ratio
0,http://www.lynnskitchenadventures.com/2009/04/...,5865,"{""title"":""Homemade Enchilada Sauce Lynn s Kitc...",recreation,0.443906,2.55814,0.389706,0.257353,0.044118,0.022059,...,0.199438,1,1,15,0,5643,136,3,0.242647,0.080597
1,http://lolpics.se/18552-stun-grenade-ar,782,"{""title"":""lolpics Stun grenade ar "",""body"":"" f...",culture_politics,0.135844,3.771429,0.461538,0.205128,0.051282,0.0,...,0.08,?,1,62,0,382,39,2,0.128205,0.176471
2,http://www.xcelerationfitness.com/treadmills.html,6962,"{""title"":""Treadmills "",""body"":"" treadmills, st...",?,?,2.269565,0.495726,0.384615,0.17094,0.17094,...,10.0,?,1,42,0,2420,117,1,0.581197,0.125
3,http://www.bloomberg.com/news/2012-02-06/syria...,7640,"{""title"":""Father s Tactics Used by Assad to Cr...",culture_politics,0.90259,2.52349,0.705502,0.346278,0.122977,0.090615,...,0.005964,1,1,41,0,5559,309,10,0.038835,0.063126
4,http://www.wired.com/gadgetlab/2011/12/stem-tu...,3589,"{""title"":""Stem Turns Lemons and Limes Into Jui...",science_technology,0.486363,1.848,0.470968,0.16129,0.032258,0.0,...,0.035714,1,0,34,0,2209,155,10,0.096774,0.065341


In [36]:
# Store the predictions as a 'label' column on test_df (this mutates the loaded frame in place).
test_df['label'] = pred

In [43]:
# Peek at the first ten rows of the two submission columns.
test_df[['urlid', 'label']].head(10)

Unnamed: 0,urlid,label
0,5865,0.861164
1,782,0.209191
2,6962,0.367135
3,7640,0.152227
4,3589,0.481974
5,6719,0.37093
6,3905,0.337508
7,9841,0.951243
8,7447,0.227502
9,4776,0.314005


In [40]:
# Write only the id and predicted-score columns, without the row index.
test_df.to_csv('submission.csv', columns=['urlid', 'label'], index=False)

In [42]:
# Submit the predictions file to the competition via the Kaggle CLI.
!kaggle competitions submit -c stumbleupon -f submission.csv -m 'Baseline benchmark'

100%|███████████████████████████████████████| 74.9k/74.9k [00:00<00:00, 115kB/s]
Successfully submitted to StumbleUpon Evergreen Classification Challenge

Obtained an AUC score of 0.87835 using Abhishek's benchmark script.