In [1]:
import pandas as pd, numpy as np

In [2]:
df = pd.read_excel('USA_SAAS_CB_10Dec16.xlsx')
df.shape

(2593, 20)

In [3]:
df = df.dropna(subset=['category_list'])
df.index = range(len(df))
df.shape

(2572, 20)

In [4]:
df['enterprise_software_present'] = df['category_list'].apply(lambda x:1 if x and 'enterprise software' in x else 0)
df['cloud'] = df['category_list'].apply(lambda x:1 if x and 'cloud' in x else 0)
df['crm'] = df['category_list'].apply(lambda x:1 if x and 'crm' in x else 0)
df['mobile'] = df['category_list'].apply(lambda x:1 if x and 'mobile' in x else 0)
df['analytics'] = df['category_list'].apply(lambda x:1 if x and 'analytics' in x else 0)
print df['enterprise_software_present'].value_counts()
print df['cloud'].value_counts()
print df['crm'].value_counts()
print df['mobile'].value_counts()
print df['analytics'].value_counts()

0    1933
1     639
Name: enterprise_software_present, dtype: int64
0    2218
1     354
Name: cloud, dtype: int64
0    2423
1     149
Name: crm, dtype: int64
0    2206
1     366
Name: mobile, dtype: int64
0    2248
1     324
Name: analytics, dtype: int64


In [5]:
df[['short_description','description']].head()

Unnamed: 0,short_description,description
0,"Zoho offers a suite of business, collaboration...","Founded in 1996, Zoho Corporation is the softw..."
1,GoFormz is helping customers to transform into...,GoFormz lets businesses capture data electroni...
2,Acumatica is a provider of cloud business mana...,Acumatica is a leading provider of cloud busin...
3,Marketing automation for tax and accounting pr...,ClientWhys is a Software as a Service (SaaS) c...
4,everbill enables startups and SMEs to create i...,With everbill startups and SMEs can easily cre...


In [6]:
# phrase generation
# not used because taking too much time to run
import nltk,re
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
from nltk.stem.wordnet import WordNetLemmatizer
import extract_phrases
from sklearn.feature_extraction.text import TfidfVectorizer,CountVectorizer

grammar = r"""
  NP1: {<JJ><NN.*>+}          # Chunk sequences of JJ, NN
  NP2: {<NN.*>+<JJ>}          # Chunk sequences of NN and JJ
  NP3: {<NN.*>+}                  #Noun phrases
  VP: {<VB.*><NN.*>+} # Chunk verbs and their arguments
  """
# phr_list = ['NP1','NP2','NP3','VP']
phr_list = ['NP1','NP2','VP']
tag_list = ['NN','NNS','NNP','NNPS','VB','VBD','VBG','VBN','VBP','VBZ']
stop_words = stopwords.words()+['http','https','goo','isnt']
cp = nltk.RegexpParser(grammar)
pe = extract_phrases.PhraseExtractor()
snowball_stemmer = SnowballStemmer('english')
wordnet_lemmatizer = WordNetLemmatizer()
reg_exp = re.compile('[^a-zA-Z ]',re.IGNORECASE)
def tokenizer(text,stem_type='lemmatize'):
    '''
    :param text:
    :param stem_type: type of stemming to be done
    :return:
    '''
    pos_tags = nltk.pos_tag(nltk.word_tokenize(text))
    phrs = pe.extract_phrase_treeinput(cp.parse(pos_tags),phr_list)
    if stem_type == 'stem':
        wrds = [snowball_stemmer.stem(i[0]) for i in pos_tags if i[1] in tag_list]
    elif stem_type == 'lemmatize':
        wrds = [wordnet_lemmatizer.lemmatize(i[0]) for i in pos_tags if i[1] in tag_list]
    else:
        wrds = [i[0] for i in pos_tags if i[1]in tag_list]
    wrds = [wrd for wrd in wrds if wrd not in stop_words]
    if stem_type == 'stem':
        phrs = ['_'.join([snowball_stemmer.stem(wrd) for wrd in nltk.word_tokenize(phr)]) for phr in phrs]
    elif stem_type == 'lemmatize':
        phrs = ['_'.join([wordnet_lemmatizer.lemmatize(wrd) for wrd in nltk.word_tokenize(phr)]) for phr in phrs]
    else:
        phrs = ['_'.join([wrd for wrd in nltk.word_tokenize(phr)]) for phr in phrs]
    wrds = [reg_exp.sub('',i) for i in wrds]
    return wrds+phrs

In [7]:
#df['short_description_clean'] = df['short_description'].fillna('').apply(lambda x: ' '.join(tokenizer(x,stem_type='lemmatize')))
#df['description_clean'] = df['description'].fillna('').apply(lambda x: ' '.join(tokenizer(x,stem_type='lemmatize')))

In [18]:
# train test split
msk = np.random.rand(len(df)) < 0.8
train = df[msk]
test = df[~msk]
train.index = range(len(train))
test.index = range(len(test))

tfidf_vectorizer_short_descr = TfidfVectorizer(max_df=0.8, min_df=0.01,stop_words='english')
X_short_descr_train = tfidf_vectorizer_short_descr.fit_transform(train['short_description'].fillna(''))
X_short_descr_test = tfidf_vectorizer_short_descr.transform(test['short_description'].fillna(''))
tfidf_vectorizer_descr = TfidfVectorizer(max_df=0.8, min_df=0.01,stop_words='english')
X_descr_train = tfidf_vectorizer_descr.fit_transform(train['description'].fillna(''))
X_descr_test = tfidf_vectorizer_descr.transform(test['description'].fillna(''))
X_short_descr_train.shape,X_descr_train.shape

((2064, 126), (2064, 1083))

In [19]:
y_train,y_test = train['enterprise_software_present'],test['enterprise_software_present']
y_train.value_counts()

0    1550
1     514
Name: enterprise_software_present, dtype: int64

In [20]:
#setting random 50% y_train as -1 (unlabelled)
msk1 = np.random.rand(len(y_train)) < 0.5
y_train[msk1] = -1
y_train.value_counts()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  app.launch_new_instance()


-1    1059
 0     768
 1     237
Name: enterprise_software_present, dtype: int64

In [21]:
from scipy.sparse import hstack
X_train_labelled = hstack([X_short_descr_train[~msk1,:],X_descr_train[~msk1,:]])
y_train_labelled = y_train[~msk1]
X_test = hstack([X_short_descr_test,X_descr_test])

In [22]:
from sklearn.metrics import classification_report
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC,SVC
from sklearn_cotraining.classifiers import CoTrainingClassifier
import sklearn_cotraining.classifiers
reload(sklearn_cotraining.classifiers)

<module 'sklearn_cotraining.classifiers' from 'sklearn_cotraining/classifiers.pyc'>

In [24]:
print 'Logistic'
base_lr = LogisticRegression()
base_lr.fit(X_train_labelled, y_train_labelled)
y_pred = base_lr.predict(X_test)
print classification_report(y_test, y_pred)

print 'Logistic CoTraining'
lg_co_clf = sklearn_cotraining.classifiers.CoTrainingClassifier(LogisticRegression(),u=len(y_train)//10,k=1000)
lg_co_clf.fit(X_short_descr_train, X_descr_train, y_train)
y_pred = lg_co_clf.predict(X_short_descr_test, X_descr_test)
print classification_report(y_test, y_pred)

print 'SVM'
base_svm = SVC(kernel='linear')
base_svm.fit(X_train_labelled, y_train_labelled)
y_pred = base_lr.predict(X_test)
print classification_report(y_test, y_pred)

print 'SVM CoTraining'
svm_co_clf = sklearn_cotraining.classifiers.CoTrainingClassifier(SVC(kernel='linear',probability=True), 
                                                                 u=len(y_train)//10,k=1000)
svm_co_clf.fit(X_short_descr_train, X_descr_train, y_train)
y_pred = svm_co_clf.predict(X_short_descr_test, X_descr_test)
print classification_report(y_test, y_pred)

Logistic
             precision    recall  f1-score   support

          0       0.77      0.97      0.85       383
          1       0.48      0.10      0.16       125

avg / total       0.70      0.75      0.68       508

Logistic CoTraining
no of iterations took for fitting:194
             precision    recall  f1-score   support

          0       0.76      0.99      0.86       383
          1       0.60      0.05      0.09       125

avg / total       0.72      0.76      0.67       508

SVM




             precision    recall  f1-score   support

          0       0.77      0.97      0.85       383
          1       0.48      0.10      0.16       125

avg / total       0.70      0.75      0.68       508

SVM CoTraining
no of iterations took for fitting:164
             precision    recall  f1-score   support

          0       0.77      0.95      0.85       383
          1       0.51      0.15      0.23       125

avg / total       0.71      0.76      0.70       508



## 80% unlabelled

In [63]:
train = df[msk]
test = df[~msk]
train.index = range(len(train))
test.index = range(len(test))
y_train,y_test = train['enterprise_software_present'],test['enterprise_software_present']
y_train.value_counts()

0    1563
1     507
Name: enterprise_software_present, dtype: int64

In [64]:
#setting random 80% y_train as -1 (unlabelled)
msk1 = np.random.rand(len(y_train)) < 0.8
y_train[msk1] = -1
X_train_labelled = hstack([X_short_descr_train[~msk1,:],X_descr_train[~msk1,:]])
y_train_labelled = y_train[~msk1]
X_test = hstack([X_short_descr_test,X_descr_test])
y_train.value_counts()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  app.launch_new_instance()


-1    1650
 0     310
 1     110
Name: enterprise_software_present, dtype: int64

In [67]:
print 'Logistic'
base_lr = LogisticRegression()
base_lr.fit(X_train_labelled, y_train_labelled)
y_pred = base_lr.predict(X_test)
print classification_report(y_test, y_pred)

print 'Logistic CoTraining'
lg_co_clf = sklearn_cotraining.classifiers.CoTrainingClassifier(LogisticRegression(),u=len(y_train)//10,k=1000)
lg_co_clf.fit(X_short_descr_train, X_descr_train, y_train)
y_pred = lg_co_clf.predict(X_short_descr_test, X_descr_test)
print classification_report(y_test, y_pred)

print 'SVM'
base_svm = LinearSVC()
base_svm.fit(X_train_labelled, y_train_labelled)
y_pred = base_lr.predict(X_test)
print classification_report(y_test, y_pred)

print 'SVM CoTraining'
svm_co_clf = sklearn_cotraining.classifiers.CoTrainingClassifier(LinearSVC(), u=len(y_train)//10,k=1000)
svm_co_clf.fit(X_short_descr_train, X_descr_train, y_train)
y_pred = svm_co_clf.predict(X_short_descr_test, X_descr_test)
print classification_report(y_test, y_pred)

Logistic
             precision    recall  f1-score   support

          0       0.75      0.99      0.85       370
          1       0.70      0.05      0.10       132

avg / total       0.73      0.75      0.65       502

Logistic CoTraining
no of iterations took for fitting:331
             precision    recall  f1-score   support

          0       0.74      1.00      0.85       370
          1       0.00      0.00      0.00       132

avg / total       0.54      0.74      0.63       502

SVM
             precision    recall  f1-score   support

          0       0.75      0.99      0.85       370
          1       0.70      0.05      0.10       132

avg / total       0.73      0.75      0.65       502

SVM CoTraining




no of iterations took for fitting:66
             precision    recall  f1-score   support

          0       0.74      1.00      0.85       370
          1       1.00      0.01      0.02       132

avg / total       0.81      0.74      0.63       502



## 90% unlabelled

In [32]:
train = df[msk]
test = df[~msk]
train.index = range(len(train))
test.index = range(len(test))
y_train,y_test = train['enterprise_software_present'],test['enterprise_software_present']
y_train.value_counts()

0    1550
1     514
Name: enterprise_software_present, dtype: int64

In [33]:
#setting random 90% y_train as -1 (unlabelled)
msk1 = np.random.rand(len(y_train)) < 0.9
y_train[msk1] = -1
X_train_labelled = hstack([X_short_descr_train[~msk1,:],X_descr_train[~msk1,:]])
y_train_labelled = y_train[~msk1]
X_test = hstack([X_short_descr_test,X_descr_test])
y_train.value_counts()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  app.launch_new_instance()


-1    1864
 0     143
 1      57
Name: enterprise_software_present, dtype: int64

In [34]:
print 'Logistic'
base_lr = LogisticRegression()
base_lr.fit(X_train_labelled, y_train_labelled)
y_pred = base_lr.predict(X_test)
print classification_report(y_test, y_pred)

print 'Logistic CoTraining'
lg_co_clf = sklearn_cotraining.classifiers.CoTrainingClassifier(LogisticRegression(), u=len(y_train)//10,k=1000)
lg_co_clf.fit(X_short_descr_train, X_descr_train, y_train)
y_pred = lg_co_clf.predict(X_short_descr_test, X_descr_test)
print classification_report(y_test, y_pred)

print 'SVM'
base_svm = SVC(kernel='linear')
base_svm.fit(X_train_labelled, y_train_labelled)
y_pred = base_lr.predict(X_test)
print classification_report(y_test, y_pred)

print 'SVM CoTraining'
svm_co_clf = sklearn_cotraining.classifiers.CoTrainingClassifier(SVC(kernel='linear',probability=True),
                                                                 u=len(y_train)//10, k=1000)
svm_co_clf.fit(X_short_descr_train, X_descr_train, y_train)
y_pred = svm_co_clf.predict(X_short_descr_test, X_descr_test)
print classification_report(y_test, y_pred)


Logistic
             precision    recall  f1-score   support

          0       0.77      0.99      0.87       383
          1       0.82      0.07      0.13       125

avg / total       0.78      0.77      0.69       508

Logistic CoTraining
no of iterations took for fitting:374
             precision    recall  f1-score   support

          0       0.75      1.00      0.86       383
          1       0.00      0.00      0.00       125

avg / total       0.57      0.75      0.65       508

SVM
             precision    recall  f1-score   support

          0       0.77      0.99      0.87       383
          1       0.82      0.07      0.13       125

avg / total       0.78      0.77      0.69       508

SVM CoTraining




no of iterations took for fitting:267
             precision    recall  f1-score   support

          0       0.77      0.84      0.80       383
          1       0.33      0.24      0.28       125

avg / total       0.66      0.69      0.67       508



## add website data

In [8]:
import pandas as pd, numpy as np

In [9]:
df = pd.read_excel('USA_SAAS_CB_10Dec16.xlsx')
df.shape

(2593, 20)

In [10]:
df = df.dropna(subset=['category_list'])
df.index = range(len(df))
df.shape

(2572, 20)

In [11]:
df['enterprise_software_present'] = df['category_list'].apply(lambda x:1 if x and 'enterprise software' in x else 0)
df['enterprise_software_present'].value_counts()

0    1933
1     639
Name: enterprise_software_present, dtype: int64

In [12]:
from sklearn.feature_extraction.text import TfidfVectorizer,CountVectorizer
import nltk,re
from scipy.sparse import hstack

In [13]:
df1 = pd.read_csv('USA_SAAS_CB_10Dec16_website_text.csv')
df1['website_text'] = df1['website_text'].apply(lambda x: re.sub('\n',' ',x))
df1.shape

(2056, 2)

In [14]:
import tldextract
df1['domain_cleaned'] = df1['website'].apply(lambda x: tldextract.extract(x.lower()).domain)
df1.tail()

No handlers could be found for logger "tldextract"


Unnamed: 0,website,website_text,domain_cleaned
2051,http://www.cisco.com/web/about/ac49/ac0/ac1/ac...,Cisco Announces Acquisition of ThinkSmart Tech...,cisco
2052,http://macheen.com,www.macheen.com,macheen
2053,http://www.rtpholdings.com,RTP Holdings - RTP Holdings Home Solution Bene...,rtpholdings
2054,http://seatserve.com,SeatServe - Stay Seated. In-Seat Delivery at Y...,seatserve
2055,http://www.contentlaunch.com/,Content Marketing Software | Content Writing S...,contentlaunch


In [15]:
df1 = df1.drop_duplicates('domain_cleaned')
df1.shape

(1906, 3)

In [16]:
df['domain_cleaned'] = df['homepage_url'].apply(lambda x: tldextract.extract(x.lower()).domain)
df = df.drop_duplicates('domain_cleaned')
df.shape

(2391, 22)

In [17]:
df_final = pd.merge(df,df1,on='domain_cleaned')
df.shape,df1.shape,df_final.shape

((2391, 22), (1906, 3), (1893, 24))

In [18]:
df_final.tail()

Unnamed: 0,li_cpy_name,country_code,state_code,region,city,category_list,category_group_list,industry,short_description,description,...,funding_total_usd,last_funding_on,homepage_url,linkedin_url,domain,email,enterprise_software_present,domain_cleaned,website,website_text
1888,NuORDER,USA,CA,Los Angeles,West Hollywood,b2b|e-commerce|fashion|internet|saas|wholesale,commerce and shopping|design|internet services,Wholesale,NuORDER is a cloud & mobile B2B eCommerce plat...,NuORDER empowers B2B eCommerce sites for over ...,...,13900000.0,2015-02-01 00:00:00,http://www.nuorder.com,http://www.linkedin.com/company/2779405,nuorder.com,info@nuorder.com,0,nuorder,http://www.nuorder.com,NuORDER #1 B2B eCommerce Solution BRANDS RETAI...
1889,EvoNexus (CommNexus),USA,CA,San Diego,La Jolla,big data|cyber security|hardware|life science|...,biotechnology|data and analytics|hardware|info...,Wireless,EvoNexus is a non-profit technology incubator ...,"EvoNexus (formerly CommNexus, formerly the Tel...",...,,,http://www.evonexus.org,http://www.linkedin.com/company/commnexus,evonexus.org,,0,evonexus,http://www.evonexus.org,EvoNexus Jobs Contact Us Resources About About...
1890,Macheen Inc,USA,TX,Austin,Austin,enterprise software|internet|saas,internet services|software,Wireless,Macheen is a SaaS-based platform that brings m...,"Macheen, Inc is a SaaS company that brings mob...",...,34394995.0,2013-12-01 00:00:00,http://macheen.com,https://www.linkedin.com/company/macheen-inc,macheen.com,info@macheen.com,1,macheen,http://macheen.com,www.macheen.com
1891,RTP Holdings,USA,PA,Philadelphia,Radnor,saas,,Wireless,"Indoor location services, strategy, solutions....","Indoor GPS makes everyday activities faster, e...",...,,,http://www.rtpholdings.com,http://www.linkedin.com/company/rtp-holdings,rtpholdings.com,,0,rtpholdings,http://www.rtpholdings.com,RTP Holdings - RTP Holdings Home Solution Bene...
1892,SeatServe,USA,NY,New York City,New York,e-commerce|mobile|sports,commerce and shopping|mobile|sports,Wireless,A real-time delivery SaaS solution designed fo...,SeatServe is a digital solutions provider base...,...,,,http://seatserve.com,https://www.linkedin.com/company/seatserve,seatserve.com,info@seatserve.com,0,seatserve,http://seatserve.com,SeatServe - Stay Seated. In-Seat Delivery at Y...


### cotraining test

In [19]:
from sklearn.metrics import classification_report
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC,SVC
from sklearn_cotraining.classifiers import CoTrainingClassifier
import sklearn_cotraining.classifiers
reload(sklearn_cotraining.classifiers)

<module 'sklearn_cotraining.classifiers' from 'sklearn_cotraining/classifiers.pyc'>

In [46]:
# train test split
np.random.seed(5)
msk = np.random.rand(len(df_final)) < 0.8
train = df_final[msk]
test = df_final[~msk]
train.index = range(len(train))
test.index = range(len(test))

tfidf_vectorizer_descr = TfidfVectorizer(max_df=0.8, min_df=0.01,stop_words='english',ngram_range=(1,2))
X_descr_train = tfidf_vectorizer_descr.fit_transform(train['description'].fillna(''))
X_descr_test = tfidf_vectorizer_descr.transform(test['description'].fillna(''))

tfidf_vectorizer_website_text = TfidfVectorizer(max_df=0.8, min_df=0.01,stop_words='english',ngram_range=(1,2))
X_website_text_train = tfidf_vectorizer_website_text.fit_transform(train['website_text'].fillna(''))
X_website_text_test = tfidf_vectorizer_website_text.transform(test['website_text'].fillna(''))

X_descr_train.shape,X_website_text_train.shape

((1506, 1199), (1506, 7435))

### 70% unlabelled

In [49]:
train = df_final[msk]
test = df_final[~msk]
train.index = range(len(train))
test.index = range(len(test))
y_train,y_test = train['enterprise_software_present'],test['enterprise_software_present']
y_train_copy = y_train.copy()
y_train.value_counts()

0    1117
1     389
Name: enterprise_software_present, dtype: int64

In [50]:
#setting random 70% y_train as -1 (unlabelled)
np.random.seed(5)
msk1 = np.random.rand(len(y_train)) < 0.7
y_train[msk1] = -1
X_train_labelled = hstack([X_descr_train[~msk1,:],X_website_text_train[~msk1,:]])
y_train_labelled = y_train[~msk1]
X_test = hstack([X_descr_test,X_website_text_test])
y_train.value_counts()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


-1    1051
 0     335
 1     120
Name: enterprise_software_present, dtype: int64

In [51]:
print 'Logistic'
base_lr = LogisticRegression(class_weight='balanced')
base_lr.fit(X_train_labelled, y_train_labelled)
print('    training set')
y_pred = base_lr.predict(X_train_labelled)
print classification_report(y_train[~msk1], y_pred)
print('    training set unlabelled')
y_pred = base_lr.predict(hstack([X_descr_train[msk1,:],X_website_text_train[msk1,:]]))
print classification_report(y_train_copy[msk1], y_pred)
print('    testing set')
y_pred = base_lr.predict(X_test)
print classification_report(y_test, y_pred)

print '\n\nLogistic CoTraining'
lg_co_clf = sklearn_cotraining.classifiers.CoTrainingClassifier(LogisticRegression(class_weight='balanced'), 
                                                                u=len(y_train)//10,k=1000)
lg_co_clf.fit(X_website_text_train, X_descr_train, y_train)
print('    training set')
y_pred = lg_co_clf.predict(X_website_text_train[~msk1,:], X_descr_train[~msk1,:])
print classification_report(y_train_copy[~msk1], y_pred)
print('    training set unlabelled')
y_pred = lg_co_clf.predict(X_website_text_train[msk1,:], X_descr_train[msk1,:])
print classification_report(y_train_copy[msk1], y_pred)
print('    testing set')
y_pred = lg_co_clf.predict(X_website_text_test, X_descr_test)
print classification_report(y_test, y_pred)

print '\n\nnaive bayes'
clf = MultinomialNB(fit_prior=False)
clf.fit(X_train_labelled, y_train_labelled)
print('    training set')
y_pred = clf.predict(X_train_labelled)
print classification_report(y_train[~msk1], y_pred)
print('    training set unlabelled')
y_pred = clf.predict(hstack([X_descr_train[msk1,:],X_website_text_train[msk1,:]]))
print classification_report(y_train_copy[msk1], y_pred)
print('    testing set')
y_pred = clf.predict(X_test)
print classification_report(y_test, y_pred)

print '\n\nnaive bayes cotraining'
clf = sklearn_cotraining.classifiers.CoTrainingClassifier(MultinomialNB(fit_prior=False), 
                                                                u=len(y_train)//10,k=1000)
clf.fit(X_website_text_train, X_descr_train, y_train)
print('    training set')
y_pred = clf.predict(X_website_text_train[~msk1,:], X_descr_train[~msk1,:])
print classification_report(y_train_copy[~msk1], y_pred)
print('    training set unlabelled')
y_pred = clf.predict(X_website_text_train[msk1,:], X_descr_train[msk1,:])
print classification_report(y_train_copy[msk1], y_pred)
print('    testing set')
y_pred = clf.predict(X_website_text_test, X_descr_test)
print classification_report(y_test, y_pred)

Logistic
    training set
             precision    recall  f1-score   support

          0       1.00      0.98      0.99       335
          1       0.94      1.00      0.97       120

avg / total       0.98      0.98      0.98       455

    training set unlabelled
             precision    recall  f1-score   support

          0       0.76      0.82      0.79       782
          1       0.34      0.26      0.30       269

avg / total       0.66      0.68      0.67      1051

    testing set
             precision    recall  f1-score   support

          0       0.81      0.84      0.82       302
          1       0.35      0.31      0.33        85

avg / total       0.71      0.72      0.71       387



Logistic CoTraining
no of iterations took for fitting:160
    training set
             precision    recall  f1-score   support

          0       0.91      0.82      0.86       335
          1       0.61      0.77      0.68       120

avg / total       0.83      0.81      0.81     



             precision    recall  f1-score   support

          0       0.74      1.00      0.85       782
          1       0.33      0.00      0.01       269

avg / total       0.64      0.74      0.64      1051

    testing set
             precision    recall  f1-score   support

          0       0.78      0.99      0.87       302
          1       0.25      0.01      0.02        85

avg / total       0.66      0.78      0.69       387



naive bayes cotraining
no of iterations took for fitting:173
    training set
             precision    recall  f1-score   support

          0       0.82      0.84      0.83       335
          1       0.52      0.47      0.49       120

avg / total       0.74      0.75      0.74       455

    training set unlabelled
             precision    recall  f1-score   support

          0       0.77      0.80      0.78       782
          1       0.33      0.29      0.31       269

avg / total       0.65      0.67      0.66      1051

    testing set




In [47]:
print 'Logistic website train'
base_lr = LogisticRegression(class_weight='balanced')
base_lr.fit(X_website_text_train, y_train_copy)
y_pred = base_lr.predict(X_website_text_train)
print classification_report(y_train_copy, y_pred)
print 'Logistic website test'
y_pred = base_lr.predict(X_website_text_test)
print classification_report(y_test, y_pred)

print '\n\nLogistic description train'
base_lr = LogisticRegression(class_weight='balanced')
base_lr.fit(X_descr_train, y_train_copy)
y_pred = base_lr.predict(X_descr_train)
print classification_report(y_train_copy, y_pred)
print 'Logistic description test'
y_pred = base_lr.predict(X_descr_test)
print classification_report(y_test, y_pred)


Logistic website train
             precision    recall  f1-score   support

          0       0.97      0.90      0.93      1117
          1       0.75      0.91      0.82       389

avg / total       0.91      0.90      0.90      1506

Logistic website test
             precision    recall  f1-score   support

          0       0.81      0.77      0.79       302
          1       0.30      0.34      0.32        85

avg / total       0.69      0.68      0.68       387



Logistic description train
             precision    recall  f1-score   support

          0       0.94      0.84      0.89      1117
          1       0.65      0.85      0.74       389

avg / total       0.87      0.85      0.85      1506

Logistic description test
             precision    recall  f1-score   support

          0       0.82      0.73      0.77       302
          1       0.31      0.42      0.36        85

avg / total       0.71      0.66      0.68       387



In [44]:
print 'svm website train'
base_svm = SVC(kernel='linear',class_weight='balanced')
base_svm.fit(X_website_text_train, y_train_copy)
y_pred = base_svm.predict(X_website_text_train)
print classification_report(y_train_copy, y_pred)
print 'svm website test'
y_pred = base_svm.predict(X_website_text_test)
print classification_report(y_test, y_pred)

print '\n\nsvm description train'
base_svm = SVC(kernel='linear',class_weight='balanced')
base_svm.fit(X_descr_train, y_train_copy)
y_pred = base_svm.predict(X_descr_train)
print classification_report(y_train_copy, y_pred)
print 'svm description test'
y_pred = base_svm.predict(X_descr_test)
print classification_report(y_test, y_pred)


svm website train
             precision    recall  f1-score   support

          0       0.93      0.78      0.85      1117
          1       0.57      0.82      0.67       389

avg / total       0.83      0.79      0.80      1506

svm website test
             precision    recall  f1-score   support

          0       0.79      0.63      0.70       302
          1       0.23      0.40      0.29        85

avg / total       0.67      0.58      0.61       387



svm description train
             precision    recall  f1-score   support

          0       0.83      0.64      0.72      1117
          1       0.38      0.64      0.48       389

avg / total       0.72      0.64      0.66      1506

svm description test
             precision    recall  f1-score   support

          0       0.83      0.62      0.71       302
          1       0.29      0.55      0.38        85

avg / total       0.71      0.60      0.64       387



In [164]:
print 'naive bayes website train'
clf = MultinomialNB(fit_prior=False)
clf.fit(X_website_text_train, y_train_copy)
y_pred = clf.predict(X_website_text_train)
print classification_report(y_train_copy, y_pred)
print 'naive bayes website test'
y_pred = clf.predict(X_website_text_test)
print classification_report(y_test, y_pred)

print '\n\nnaive bayes description train'
clf = MultinomialNB(fit_prior=False)
clf.fit(X_descr_train, y_train_copy)
y_pred = clf.predict(X_descr_train)
print classification_report(y_train_copy, y_pred)
print 'naive bayes description test'
y_pred = clf.predict(X_descr_test)
print classification_report(y_test, y_pred)


naive bayes website train
             precision    recall  f1-score   support

          0       0.86      0.81      0.83      1117
          1       0.53      0.62      0.58       389

avg / total       0.78      0.76      0.77      1506

naive bayes website test
             precision    recall  f1-score   support

          0       0.81      0.72      0.76       302
          1       0.29      0.41      0.34        85

avg / total       0.70      0.65      0.67       387



naive bayes description train
             precision    recall  f1-score   support

          0       0.91      0.82      0.86      1117
          1       0.60      0.76      0.67       389

avg / total       0.83      0.81      0.81      1506

naive bayes description test
             precision    recall  f1-score   support

          0       0.81      0.71      0.75       302
          1       0.28      0.40      0.33        85

avg / total       0.69      0.64      0.66       387



In [62]:
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import StandardScaler  
scaler = StandardScaler(with_mean=False)  
scaler.fit(X_website_text_train)
X_website_text_train = scaler.transform(X_website_text_train)
X_website_text_test = scaler.transform(X_website_text_test)
print 'neural_network website train'
clf = MLPClassifier(hidden_layer_sizes=(2),alpha=0.3)
clf.fit(X_website_text_train, y_train_copy)
y_pred = clf.predict(X_website_text_train)
print classification_report(y_train_copy, y_pred)
print 'neural_network website test'
y_pred = clf.predict(X_website_text_test)
print classification_report(y_test, y_pred)

scaler = StandardScaler(with_mean=False)  
scaler.fit(X_descr_train)
X_descr_train = scaler.transform(X_descr_train)
X_descr_test = scaler.transform(X_descr_test)
print '\n\nneural_network description train'
clf = MLPClassifier(hidden_layer_sizes=(2),alpha=0.3)
clf.fit(X_descr_train, y_train_copy)
y_pred = clf.predict(X_descr_train)
print classification_report(y_train_copy, y_pred)
print 'neural_network description test'
y_pred = clf.predict(X_descr_test)
print classification_report(y_test, y_pred)

neural_network website train
             precision    recall  f1-score   support

          0       1.00      0.98      0.99      1117
          1       0.95      0.99      0.97       389

avg / total       0.98      0.98      0.98      1506

neural_network website test
             precision    recall  f1-score   support

          0       0.79      0.84      0.81       302
          1       0.25      0.19      0.22        85

avg / total       0.67      0.70      0.68       387



neural_network description train
             precision    recall  f1-score   support

          0       0.93      1.00      0.97      1117
          1       1.00      0.79      0.88       389

avg / total       0.95      0.95      0.94      1506

neural_network description test
             precision    recall  f1-score   support

          0       0.79      0.72      0.75       302
          1       0.23      0.31      0.27        85

avg / total       0.66      0.63      0.64       387



In [53]:
y_train_copy.shape,X_website_text_train.shape,X_descr_train.shape

((1506,), (1506, 7435), (1506, 1199))