In [1]:
from sklearn.feature_extraction.text import CountVectorizer
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
import csv
import string
import re

In [2]:
exchange = pd.read_csv('~/Datasets/stack_exchange.csv')
exchange.drop('Unnamed: 0', inplace=True, axis=1)

In [3]:
def remove_punctuation(text):
    exclude = set(string.punctuation)
    return "".join(ch for ch in text if ch not in exclude)

def cleanhtml(raw_html):
    cleanr = re.compile('<.*?>')
    cleantext = re.sub(cleanr, '', raw_html)
    cleantext = cleantext.replace('\n', '')
    return cleantext

exchange['all_text'] = exchange['title'] + " " + exchange['content']
exchange['all_text'] = exchange['all_text'].apply(lambda x: x.lower())
exchange['no_html'] = exchange['all_text'].apply(cleanhtml)
exchange['no_punctuation'] = exchange['no_html'].apply(remove_punctuation)

In [4]:
exchange.head()

Unnamed: 0,id,title,content,tags,subject,subject_code,all_text,no_html,no_punctuation
0,1,What is the criticality of the ribosome bindin...,"<p>In prokaryotic translation, how critical fo...",ribosome binding-sites translation synthetic-b...,biology,1,what is the criticality of the ribosome bindin...,what is the criticality of the ribosome bindin...,what is the criticality of the ribosome bindin...
1,2,How is RNAse contamination in RNA based experi...,<p>Does anyone have any suggestions to prevent...,rna biochemistry,biology,1,how is rnase contamination in rna based experi...,how is rnase contamination in rna based experi...,how is rnase contamination in rna based experi...
2,3,Are lymphocyte sizes clustered in two groups?,<p>Tortora writes in <em>Principles of Anatomy...,immunology cell-biology hematology,biology,1,are lymphocyte sizes clustered in two groups? ...,are lymphocyte sizes clustered in two groups? ...,are lymphocyte sizes clustered in two groups t...
3,4,How long does antibiotic-dosed LB maintain goo...,<p>Various people in our lab will prepare a li...,cell-culture,biology,1,how long does antibiotic-dosed lb maintain goo...,how long does antibiotic-dosed lb maintain goo...,how long does antibioticdosed lb maintain good...
4,5,Is exon order always preserved in splicing?,<p>Are there any cases in which the splicing m...,splicing mrna spliceosome introns exons,biology,1,is exon order always preserved in splicing? <p...,is exon order always preserved in splicing? ar...,is exon order always preserved in splicing are...


In [5]:
cv = CountVectorizer(stop_words='english', max_features=2, binary=True)


In [6]:
exchange_vectors = cv.fit_transform(exchange['no_punctuation']).todense()
exchange_vectors = pd.DataFrame(exchange_vectors, columns=cv.get_feature_names())

In [7]:
exchange_vectors.head()

Unnamed: 0,im,like
0,0,0
1,0,0
2,0,0
3,0,0
4,1,0


In [8]:
for i in exchange_vectors.columns:
    print i, exchange_vectors[i].sum()

im 15463
like 16983


In [9]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [10]:
tvec = TfidfVectorizer(stop_words='english')
tvec.fit(exchange['no_punctuation'])

TfidfVectorizer(analyzer=u'word', binary=False, decode_error=u'strict',
        dtype=<type 'numpy.int64'>, encoding=u'utf-8', input=u'content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), norm=u'l2', preprocessor=None, smooth_idf=True,
        stop_words='english', strip_accents=None, sublinear_tf=False,
        token_pattern=u'(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)

In [11]:
df = pd.DataFrame(tvec.transform(exchange['no_punctuation']).todense(), columns=tvec.get_feature_names())

In [12]:
df.head()

Unnamed: 0,00,000,0000,00000,000000,0000000,00000000,000000000,0000000000,0000000000000,...,샤부샤부,카스테라,ﬁelds,ﬁnd,ﬁnished,ﬁrst,ﬁshed,ﬁtting,ﬁttings,ﬁxed
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [92]:
## gets best tfidf score, and puts it into a tuple with the word associated with it
def get_best_word(df, index):
    row = df.iloc[index, :]
    row_tuple = zip(row.index, row.values)
    return max(row_tuple, key=lambda x: x[1])

In [None]:
## makes a list of tuples of the best words and their scores
## this works, but it takes a while
list_of_tags = []
for i in range(len(df)):
    list_of_tags.append(get_best_word(df,i))

In [83]:
max_values = []
for i in range(0,len(df.iloc[:,0])):
    max_values.append(max(df.iloc[i,:]))
    print i

KeyboardInterrupt: 

In [91]:
row = df.iloc[1,:]
row.index

Index([u'00', u'000', u'0000', u'00000', u'000000', u'0000000', u'00000000',
       u'000000000', u'0000000000', u'0000000000000',
       ...
       u'샤부샤부', u'카스테라', u'ﬁelds', u'ﬁnd', u'ﬁnished', u'ﬁrst', u'ﬁshed',
       u'ﬁtting', u'ﬁttings', u'ﬁxed'],
      dtype='object', length=242358)

In [None]:
df['max_values'] = max_values

In [81]:
max(df.iloc[2,:])

0.51070867171527923

In [22]:
first_row = df.head(1)

In [39]:
test = df.iloc[0,:].transpose()

In [48]:
test.index

Index([u'00', u'000', u'0000', u'00000', u'000000', u'0000000', u'00000000',
       u'000000000', u'0000000000', u'0000000000000',
       ...
       u'샤부샤부', u'카스테라', u'ﬁelds', u'ﬁnd', u'ﬁnished', u'ﬁrst', u'ﬁshed',
       u'ﬁtting', u'ﬁttings', u'ﬁxed'],
      dtype='object', length=242358)

In [68]:
row = zip(test.index, test)

In [54]:
max(row)

0.54384058593665485

In [63]:
row_dict.values()[0]

0.0

In [61]:
row_dict.keys()[0]

u'6000lbs'

In [64]:
for i in range(len(row_dict)):
    maximum = max(row_dict.values())
    if row_dict.values()[i] == maximum:
        print row_dict.keys()[i]

KeyboardInterrupt: 

In [33]:
test[0]

0.0

In [23]:
first_row.shape

(1, 242358)

In [26]:
# first_row.describe()

In [24]:
for i in first_row.columns:
    if first_row[i].sum() > 0.54:
        print i

translation


In [28]:
df.iloc?

In [13]:
df.shape

(67721, 242358)

In [14]:
# df.sum()

In [15]:
exchange.shape

(67721, 9)

In [None]:
## syntax for tfdif

# tvec = TfidfVectorizer(stop_words='english')
# tvec.fit([spam, ham])

# df = pd.DataFrame(tvec.transform([spam, ham]).todense(), columns=tvec.get_feature_names(), index=['spam', 'ham'])