In [1]:
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
import pandas as pd
import numpy as np
import re
from ast import literal_eval

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/siddharthnayak/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
def read_data(filename):
    data = pd.read_csv(filename, sep='\t')
    data['tags'] = data['tags'].apply(literal_eval)
    return data

In [3]:
train=read_data('/Users/siddharthnayak/Downloads/natural-language-processing-master/data/train.tsv')
validation=read_data('/Users/siddharthnayak/Downloads/natural-language-processing-master/data/validation.tsv')
test=pd.read_csv('/Users/siddharthnayak/Downloads/natural-language-processing-master/data/test.tsv',sep='\t')


In [4]:
train.head()

Unnamed: 0,title,tags
0,How to draw a stacked dotplot in R?,[r]
1,mysql select all records where a datetime fiel...,"[php, mysql]"
2,How to terminate windows phone 8.1 app,[c#]
3,get current time in a specific country via jquery,"[javascript, jquery]"
4,Configuring Tomcat to Use SSL,[java]


In [5]:
train['title'].values

array(['How to draw a stacked dotplot in R?',
       'mysql select all records where a datetime field is less than a specified value',
       'How to terminate windows phone 8.1 app', ...,
       'Python Pandas Series of Datetimes to Seconds Since the Epoch',
       'jqGrid issue grouping - Duplicate rows get appended every time sort is changed',
       'Create a List of primitive int?'], dtype=object)

In [6]:
X_train, y_train = train['title'].values, train['tags'].values
X_val, y_val = validation['title'].values, validation['tags'].values
X_test = test['title'].values

In [7]:
REPLACE_BY_SPACE_RE = re.compile('[/(){}\[\]\|@,;]')
BAD_SYMBOLS_RE = re.compile('[^0-9a-z #+_]')
STOPWORDS = set(stopwords.words('english'))

In [8]:
def text_prepare(text):
    """
        text: a string
        
        return: modified initial string
    """
    text = text.lower()# lowercase text
    text =  REPLACE_BY_SPACE_RE.sub('',text) # replace REPLACE_BY_SPACE_RE symbols by space in text
    text = BAD_SYMBOLS_RE.sub('',text)# delete symbols which are in BAD_SYMBOLS_RE from text
    sent = ''
    for word in text.split():
        if word not in STOPWORDS:
            if sent=='':
                sent=word
            else:    
                sent=sent+' '+word  # delete stopwords from text
    text=sent        
    return text

In [9]:
def test_text_prepare():
    examples = ["SQL Server - any equivalent of Excel's CHOOSE function?",
                "How to free c++ memory vector<int> * arr?"]
    answers = ["sql server equivalent excels choose function", 
               "free c++ memory vectorint arr"]
    for ex, ans in zip(examples, answers):
        if text_prepare(ex) != ans:
            return "Wrong answer for the case: '%s'" % ex
    return 'Basic tests are passed.'

In [10]:
X_train = [text_prepare(x) for x in X_train]
X_val = [text_prepare(x) for x in X_val]
X_test = [text_prepare(x) for x in X_test]

In [11]:
# Dictionary of all tags from train corpus with their counts.
tags_counts = {}
# Dictionary of all words from train corpus with their counts.
words_counts = {}
for words in X_train:
    for word in words.split():
        if word in words_counts:
            words_counts[word]+=1
        else:
            words_counts[word]=1
            
for words in train['tags']:
    for word in words:
        if word in tags_counts:
            tags_counts[word]+=1
        else:
            tags_counts[word]=1

In [12]:
most_common_tags = sorted(tags_counts.items(), key=lambda x: x[1], reverse=True)[:3]
most_common_words = sorted(words_counts.items(), key=lambda x: x[1], reverse=True)[:3]
print(most_common_tags,most_common_words)

[('javascript', 19078), ('c#', 19077), ('java', 18661)] [('using', 8274), ('php', 5422), ('java', 5397)]


In [13]:
DICT_SIZE = 5000
most_common_words = sorted(words_counts.items(), key=lambda x: x[1], reverse=True)[:DICT_SIZE]
#WORDS_TO_INDEX 
WORDS_TO_INDEX ={}
for i in range(DICT_SIZE):
    WORDS_TO_INDEX [most_common_words[i][0]]=i
       
    
#INDEX_TO_WORDS = ####### YOUR CODE HERE #######
ALL_WORDS = WORDS_TO_INDEX.keys()

def my_bag_of_words(text, words_to_index, dict_size):
    """
        text: a string
        dict_size: size of the dictionary
        
        return a vector which is a bag-of-words representation of 'text'
    """
    result_vector = np.zeros(dict_size)
    text=text.split() # get the words in the sentence
    for word in text:
        i=words_to_index.get(word)
        if i!=None:
            result_vector[i]=1
    return result_vector

In [14]:
from scipy import sparse as sp_sparse

In [15]:
#Compressed sparse matrix
X_train_mybag = sp_sparse.vstack([sp_sparse.csr_matrix(my_bag_of_words(text, WORDS_TO_INDEX, DICT_SIZE)) for text in X_train])
X_val_mybag = sp_sparse.vstack([sp_sparse.csr_matrix(my_bag_of_words(text, WORDS_TO_INDEX, DICT_SIZE)) for text in X_val])
X_test_mybag = sp_sparse.vstack([sp_sparse.csr_matrix(my_bag_of_words(text, WORDS_TO_INDEX, DICT_SIZE)) for text in X_test])
print('X_train shape ', X_train_mybag.shape)
print('X_val shape ', X_val_mybag.shape)
print('X_test shape ', X_test_mybag.shape)

X_train shape  (100000, 5000)
X_val shape  (30000, 5000)
X_test shape  (20000, 5000)


In [16]:
row = X_train_mybag[10].toarray()[0]
non_zero_elements_count =np.count_nonzero(row)
print(non_zero_elements_count)

7


In [17]:
from grader import Grader
grader = Grader()
grader.submit_tag('WordsTagsCount', '%s\n%s' % (','.join(tag for tag, _ in most_common_tags), 
                                                ','.join(word for word, _ in most_common_words)))
grader.submit_tag('BagOfWords', str(non_zero_elements_count))

Current answer for task WordsTagsCount is:
 javascript,c#,java
using,php,java,file,javascript,error,get,c#,python,string,array,data,value,jquery...
Current answer for task BagOfWords is:
 7...


# TF-IDF 

In [18]:
from sklearn.feature_extraction.text import TfidfVectorizer


In [19]:
def tfidf_features(X_train, X_val, X_test):
    """
        X_train, X_val, X_test — samples        
        return TF-IDF vectorized representation of each sample and vocabulary
    """
    # Create TF-IDF vectorizer with a proper parameters choice
    # Fit the vectorizer on the train set
    # Transform the train, test, and val sets and return the result
    
    
    tfidf_vectorizer = TfidfVectorizer(min_df=5,max_df=0.9,ngram_range=(1,2),token_pattern='(\S+)')
    features=tfidf_vectorizer.fit_transform(X_train)
    X_train=pd.DataFrame(features.todense(),columns=tfidf_vectorizer.get_feature_names())
   
    X_val=tfidf_vectorizer.transform(X_val)
    X_val=X_val.todense()
    #X_val=pd.DataFrame(features.todense(),columns=tfidf_vectorizer.get_feature_names())
    
    
    X_test=tfidf_vectorizer.transform(X_test)
    X_test=X_test.todense()
    #X_test=pd.DataFrame(features.todense(),columns=tfidf_vectorizer.get_feature_names())
    

    
    return X_train, X_val, X_test, tfidf_vectorizer.vocabulary_

In [20]:
X_train_tfidf, X_val_tfidf, X_test_tfidf, tfidf_vocab = tfidf_features(X_train, X_val, X_test)
tfidf_reversed_vocab = {i:word for word,i in tfidf_vocab.items()}

In [27]:
X_train_tfidf.shape,X_val_tfidf.shape,X_test_tfidf.shape

((100000, 17966), (30000, 17966), (20000, 17966))

In [21]:
for name, age in tfidf_reversed_vocab.items():    # for name, age in list.items():  (for Python 3.x)
    if age == 'c#':
        print (name)

1839


# Training

In [22]:
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn import multiclass
from sklearn import linear_model

In [23]:
mlb = MultiLabelBinarizer(classes=sorted(tags_counts.keys()))
y_train = mlb.fit_transform(y_train)
y_val = mlb.fit_transform(y_val)

In [24]:
def train_classifier(X_train, y_train):
    """
      X_train, y_train — training data
      
      return: trained classifier
    """
    
    logreg = linear_model.LogisticRegression(verbose=True)
    clf = multiclass.OneVsRestClassifier(logreg)
    fit=clf.fit(X_train,y_train)
    # Create and fit LogisticRegression wraped into OneVsRestClassifier.
    return fit

In [25]:
classifier_mybag = train_classifier(X_train_mybag, y_train)

[LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear

In [28]:
classifier_tfidf = train_classifier(X_train_tfidf, y_train)

[LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear][LibLinear

In [29]:
y_val_predicted_labels_mybag = classifier_mybag.predict(X_val_mybag)
y_val_predicted_scores_mybag = classifier_mybag.decision_function(X_val_mybag)

y_val_predicted_labels_tfidf = classifier_tfidf.predict(X_val_tfidf)
y_val_predicted_scores_tfidf = classifier_tfidf.decision_function(X_val_tfidf)

In [49]:
X_train_tfidf.shape

(100000, 17966)

In [30]:
y_val_pred_inversed = mlb.inverse_transform(y_val_predicted_labels_tfidf)
y_val_inversed = mlb.inverse_transform(y_val)
for i in range(3):
    print('Title:\t{}\nTrue labels:\t{}\nPredicted labels:\t{}\n\n'.format(
        X_val[i],
        ','.join(y_val_inversed[i]),
        ','.join(y_val_pred_inversed[i])
    ))

Title:	odbc_exec always fail
True labels:	php,sql
Predicted labels:	


Title:	access base classes variable within child class
True labels:	javascript
Predicted labels:	


Title:	contenttype applicationjson required rails
True labels:	ruby,ruby-on-rails
Predicted labels:	ruby-on-rails




# Evaluation

In [38]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.metrics import roc_auc_score 
from sklearn.metrics import average_precision_score
from sklearn.metrics import recall_score

In [51]:
def print_evaluation_scores(y_val, predicted):
    print(accuracy_score(y_val, predicted))
    print(f1_score(y_val, predicted, average=None))
    print(roc_auc_score(y_val, predicted))
    print(average_precision_score(y_val, predicted))
    print(recall_score(y_val, predicted,average=None))

In [52]:
print('Bag-of-words')
print_evaluation_scores(y_val, y_val_predicted_labels_mybag)
print('Tfidf')
print_evaluation_scores(y_val, y_val_predicted_labels_tfidf)

Bag-of-words
0.3563
[ 0.13936249  0.60021668  0.22988506  0.69887955  0.86522911  0.32820513
  0.48741007  0.53652835  0.44680851  0.62450593  0.76664345  0.68297272
  0.06451613  0.02484472  0.87272727  0.39634941  0.68548387  0.056
  0.23148148  0.34285714  0.83387622  0.20111732  0.69411765  0.66242038
  0.6618705   0.8516129   0.05853659  0.19941349  0.05031447  0.66055046
  0.71653543  0.72636816  0.32997602  0.37536657  0.15909091  0.52051282
  0.38863636  0.80754561  0.8326766   0.62407862  0.74161378  0.61488673
  0.81553398  0.688       0.37536657  0.3255814   0.0661157   0.54455446
  0.72316384  0.48251748  0.63989108  0.68478261  0.66666667  0.73516386
  0.09160305  0.80203046  0.44933921  0.82987552  0.13333333  0.17391304
  0.82785352  0.42424242  0.82025528  0.05369128  0.10752688  0.68669528
  0.79207921  0.71759891  0.48421053  0.53139217  0.76428175  0.09090909
  0.70689655  0.52972973  0.4950495   0.59701493  0.4640884   0.65
  0.53594771  0.25420561  0.32340426  0.26

In [56]:
from sklearn.metrics import roc_curve
%matplotlib inline

In [57]:
n_classes = len(tags_counts)
roc_curve(y_val, y_val_predicted_scores_mybag, n_classes)

ValueError: multilabel-indicator format is not supported