## Text Representation

In [24]:
# Toy corpus: three short documents used to demonstrate the vectorizers.
corpus = [
    'text text mining is interesting',
    'text mining is like data mining',
    'text and data mining have few differences',
]

In [79]:
from sklearn.feature_extraction.text import CountVectorizer

In [122]:
# Useful CountVectorizer options:
# binary=True         -> record presence/absence (0/1) instead of raw counts
# max_df=2            -> drop terms that occur in more than 2 docs (corpus-specific stop words)
# min_df=2            -> drop terms that occur in fewer than 2 docs (rare / noise words)
# max_features=4      -> keep only the 4 terms with the highest total frequency across the corpus
# ngram_range=(2, 2)  -> bigrams only, e.g. 'text text', 'text mining', 'mining is'
# ngram_range=(2, 3)  -> bigrams and trigrams (min 2, max 3), e.g. 'text text', 'text text mining'

# Default settings: raw counts of single-word tokens.
vec = CountVectorizer()

In [123]:
# Learn the vocabulary from the corpus and build the sparse document-term count matrix.
X = vec.fit_transform(corpus)

In [124]:
# Sparse-matrix summary; the shape reads (rows = documents, cols = vocabulary terms).
X

<3x10 sparse matrix of type '<class 'numpy.int64'>'
	with 16 stored elements in Compressed Sparse Row format>

In [125]:
# Densify to inspect the token counts: one row per document, one column per vocabulary term.
X.toarray()

array([[0, 0, 0, 0, 0, 1, 1, 0, 1, 2],
       [0, 1, 0, 0, 0, 0, 1, 1, 2, 1],
       [1, 1, 1, 1, 1, 0, 0, 0, 1, 1]])

In [126]:
# List the unique tokens of the learned vocabulary, in column order.
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# get_feature_names_out() is its replacement. It returns a NumPy array, so
# .tolist() keeps the plain-list display.
vec.get_feature_names_out().tolist()

['and',
 'data',
 'differences',
 'few',
 'have',
 'interesting',
 'is',
 'like',
 'mining',
 'text']

In [127]:
# Vocabulary size. Uses get_feature_names_out() because get_feature_names()
# was deprecated in scikit-learn 1.0 and removed in 1.2.
len(vec.get_feature_names_out())

10

In [128]:
# Mapping from each token to its column index in the count matrix.
vec.vocabulary_

{'text': 9,
 'mining': 8,
 'is': 6,
 'interesting': 5,
 'like': 7,
 'data': 1,
 'and': 0,
 'have': 4,
 'few': 3,
 'differences': 2}

In [129]:
########################################################

### TF-IDF Vectorization

In [130]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [157]:
# Keep terms found in at least 1 and at most 2 documents, cap the vocabulary
# at 20 features, and extract unigrams through trigrams.
vec = TfidfVectorizer(
    min_df=1,
    max_df=2,
    max_features=20,
    ngram_range=(1, 3),
)

In [158]:
# Fit the TF-IDF vectorizer on the corpus; this rebinds X, replacing the earlier count matrix.
X = vec.fit_transform(corpus)

In [159]:
# Sparse-matrix summary of the TF-IDF result (documents x n-gram features).
X

<3x20 sparse matrix of type '<class 'numpy.float64'>'
	with 26 stored elements in Compressed Sparse Row format>

In [160]:
# Dense view of the TF-IDF weights: one row per document, one column per n-gram feature.
X.toarray()

array([[0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.5       , 0.        , 0.        , 0.        , 0.        ,
        0.5       , 0.        , 0.        , 0.5       , 0.5       ],
       [0.        , 0.        , 0.32516555, 0.32516555, 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.32516555, 0.42755362, 0.42755362, 0.        , 0.        ,
        0.32516555, 0.        , 0.        , 0.32516555, 0.32516555],
       [0.27569246, 0.27569246, 0.20967122, 0.20967122, 0.27569246,
        0.27569246, 0.27569246, 0.27569246, 0.27569246, 0.27569246,
        0.        , 0.        , 0.        , 0.27569246, 0.27569246,
        0.        , 0.27569246, 0.27569246, 0.        , 0.        ]])

In [161]:
# List the selected n-gram features, in column order.
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# get_feature_names_out() is its replacement. It returns a NumPy array, so
# .tolist() keeps the plain-list display.
vec.get_feature_names_out().tolist()

['and data',
 'and data mining',
 'data',
 'data mining',
 'data mining have',
 'differences',
 'few',
 'few differences',
 'have',
 'have few',
 'is',
 'is like data',
 'like data mining',
 'mining have',
 'mining have few',
 'mining is',
 'text and',
 'text and data',
 'text mining',
 'text mining is']

In [162]:
# Mapping from each n-gram to its column index in the TF-IDF matrix.
vec.vocabulary_

{'is': 10,
 'text mining': 18,
 'mining is': 15,
 'text mining is': 19,
 'data': 2,
 'data mining': 3,
 'is like data': 11,
 'like data mining': 12,
 'have': 8,
 'few': 6,
 'differences': 5,
 'text and': 16,
 'and data': 0,
 'mining have': 13,
 'have few': 9,
 'few differences': 7,
 'text and data': 17,
 'and data mining': 1,
 'data mining have': 4,
 'mining have few': 14}

In [163]:
#################################################################

In [192]:
# Read the whole labelled dataset into a single string. The context manager
# closes the file handle as soon as the read finishes instead of leaving it
# open until garbage collection.
# Note: this rebinds `corpus` from the toy list of documents to one raw string.
with open('dataset.txt') as dataset_file:
    corpus = dataset_file.read()


In [193]:
# Trim leading/trailing whitespace, then split into one record per line.
docs = corpus.strip().split('\n')

In [211]:
# Split each "text : label" record into two parallel lists:
# documents holds the texts (x), labels holds the targets (y).
documents, labels = [], []

for doc in docs:
    if not doc.strip():
        # Skip blank lines instead of failing on the unpack below.
        continue
    # Split on the LAST colon only, so a colon inside the text no longer
    # raises "too many values to unpack". Assumes the label itself contains
    # no colon. A record with no colon at all still raises ValueError.
    text, label = doc.rsplit(':', 1)
    documents.append(text.strip())
    labels.append(label.strip())

In [210]:
# New TF-IDF vectorizer with default settings (rebinds vec).
vec = TfidfVectorizer()

In [213]:
# Learn the vocabulary from the document texts and build their sparse TF-IDF matrix.
x = vec.fit_transform(documents)

In [214]:
# Sparse-matrix summary (documents x vocabulary terms).
x

<6x41 sparse matrix of type '<class 'numpy.float64'>'
	with 51 stored elements in Compressed Sparse Row format>

In [215]:
# Dense view of the TF-IDF weights, one row per document.
x.toarray()

array([[0.        , 0.        , 0.        , 0.        , 0.3170431 ,
        0.        , 0.        , 0.        , 0.3170431 , 0.        ,
        0.        , 0.        , 0.        , 0.3170431 , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.38663118, 0.38663118, 0.        , 0.38663118,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.38663118, 0.        , 0.        , 0.3170431 ,
        0.        ],
       [0.        , 0.37393382, 0.        , 0.37393382, 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.30663108,
        0.37393382, 0.37393382, 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.     

In [216]:
# List the vocabulary learned from the documents, in column order.
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# get_feature_names_out() is its replacement. It returns a NumPy array, so
# .tolist() keeps the plain-list display.
vec.get_feature_names_out().tolist()

['and',
 'but',
 'care',
 'cook',
 'cooking',
 'demand',
 'diet',
 'fit',
 'for',
 'hardwork',
 'important',
 'in',
 'include',
 'is',
 'it',
 'like',
 'longer',
 'lot',
 'my',
 'need',
 'of',
 'one',
 'pastas',
 'playing',
 'priorities',
 'professional',
 'recipe',
 'snacks',
 'soup',
 'special',
 'sports',
 'sportsmen',
 'stay',
 'take',
 'takes',
 'their',
 'this',
 'to',
 'usually',
 'very',
 'while']

In [217]:
# Mapping from each term to its column index in the TF-IDF matrix.
vec.vocabulary_

{'this': 36,
 'recipe': 26,
 'is': 13,
 'very': 39,
 'special': 29,
 'for': 8,
 'cooking': 4,
 'snacks': 27,
 'like': 15,
 'to': 37,
 'cook': 3,
 'but': 1,
 'it': 14,
 'usually': 38,
 'takes': 34,
 'longer': 16,
 'my': 18,
 'priorities': 24,
 'in': 11,
 'include': 12,
 'pastas': 22,
 'and': 0,
 'soup': 28,
 'one': 21,
 'need': 19,
 'stay': 32,
 'fit': 7,
 'while': 40,
 'playing': 23,
 'professional': 25,
 'sports': 30,
 'important': 10,
 'sportsmen': 31,
 'take': 33,
 'care': 2,
 'of': 20,
 'their': 35,
 'diet': 6,
 'demand': 5,
 'lot': 17,
 'hardwork': 9}

In [218]:
####################################################################

In [232]:
# Read the whole file into one string. The context manager closes the file
# handle as soon as the read finishes.
# errors='ignore' silently drops any bytes that are not valid UTF-8, so the
# text may lose characters without warning.
with open('buttons_amazon_kindle.txt.data', encoding='utf-8', errors='ignore') as kindle_file:
    corpus = kindle_file.read()

In [237]:
# One document per line. Blank lines (for example the empty string left after
# a trailing newline) are dropped so they do not become all-zero rows in the
# TF-IDF matrix.
docs = [line for line in corpus.split('\n') if line.strip()]

In [238]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [239]:
# New TF-IDF vectorizer with default settings (rebinds vec).
vec = TfidfVectorizer()

In [240]:
# Learn the vocabulary from the file's lines and build their sparse TF-IDF matrix (rebinds x).
x = vec.fit_transform(docs)

In [241]:
# Sparse-matrix summary (lines of the file x vocabulary terms).
x

<166x788 sparse matrix of type '<class 'numpy.float64'>'
	with 2700 stored elements in Compressed Sparse Row format>

In [242]:
# Dense view of the TF-IDF weights.
# NOTE(review): this densifies the entire matrix just to display it; consider slicing first.
x.toarray()

array([[0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       ...,
       [0.        , 0.        , 0.263758  , ..., 0.        , 0.29434293,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ]])

## Assignment 

In [290]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [288]:
# Read the CSV as raw text into one string. The context manager closes the
# file handle as soon as the read finishes.
# errors='ignore' silently drops any bytes that are not valid UTF-8.
with open('dataset2.csv', encoding='utf8', errors='ignore') as csv_file:
    corpus = csv_file.read()

In [289]:
# One document per line of the file. Blank lines (for example the empty string
# left after a trailing newline) are dropped so they do not become all-zero
# rows in the TF-IDF matrix.
# NOTE(review): the CSV is split as raw text, so a header row (if the file has
# one) and every column of each row are treated as document text - confirm
# this is intended.
docs = [line for line in corpus.split('\n') if line.strip()]

In [291]:
# Baseline TF-IDF vectorizer with default settings (no document-frequency filtering).
vector = TfidfVectorizer()

In [292]:
# Learn the vocabulary from the file's lines and build their sparse TF-IDF matrix.
info = vector.fit_transform(docs)

In [293]:
# Sparse-matrix summary for the unfiltered vocabulary.
info

<653x2480 sparse matrix of type '<class 'numpy.float64'>'
	with 14783 stored elements in Compressed Sparse Row format>

In [294]:
# Rebuild the vectorizer with document-frequency cut-offs: ignore terms that
# appear in fewer than 20 documents or in more than 200 documents.
vector = TfidfVectorizer(
    min_df=20,
    max_df=200,
)

In [295]:
# Refit with the filtered vocabulary; this rebinds info, replacing the unfiltered matrix.
info = vector.fit_transform(docs)

In [296]:
# Sparse-matrix summary after document-frequency filtering.
info

<653x126 sparse matrix of type '<class 'numpy.float64'>'
	with 6471 stored elements in Compressed Sparse Row format>

In [297]:
# Dense view of the filtered TF-IDF weights.
# NOTE(review): this densifies the entire matrix just to display it; consider slicing first.
info.toarray()

array([[0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       ...,
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.41527158, ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ]])