In [81]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

# import warnings filter
from warnings import simplefilter
# ignore all future warnings
simplefilter(action='ignore', category=FutureWarning)

In [83]:
# Get the scikit-learn data set:
from sklearn.datasets import fetch_20newsgroups

In [84]:
# Extract information from the data's dictionary format 

categories = ['alt.atheism',
'talk.religion.misc',
'comp.graphics',
'sci.space']

# Set training data
data_train = fetch_20newsgroups(subset='train', categories=categories,
                                shuffle=True, random_state=42,
                                remove=('headers', 'footers', 'quotes'))
# Set testing data
data_test = fetch_20newsgroups(subset='test', categories=categories,
                               shuffle=True, random_state=42,
                               remove=('headers', 'footers', 'quotes'))

In [85]:
# Inspect data

In [86]:
type(data_train)

sklearn.utils.Bunch

In [87]:
len(data_train.data)

2034

In [88]:
print(data_train.data[0])

Hi,

I've noticed that if you only save a model (with all your mapping planes
positioned carefully) to a .3DS file that when you reload it after restarting
3DS, they are given a default position and orientation.  But if you save
to a .PRJ file their positions/orientation are preserved.  Does anyone
know why this information is not stored in the .3DS file?  Nothing is
explicitly said in the manual about saving texture rules in the .PRJ file. 
I'd like to be able to read the texture rule information, does anyone have 
the format for the .PRJ file?

Is the .CEL file format available from somewhere?

Rych


In [89]:
# Create a bag-of-words model

from sklearn.feature_extraction.text import CountVectorizer

In [90]:
cvect = CountVectorizer()
X_train = data_train.data
cvect.fit(X_train)

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
                lowercase=True, max_df=1.0, max_features=None, min_df=1,
                ngram_range=(1, 1), preprocessor=None, stop_words=None,
                strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
                tokenizer=None, vocabulary=None)

In [91]:
cvect.vocabulary_

{'hi': 12171,
 've': 25588,
 'noticed': 17138,
 'that': 24080,
 'if': 12693,
 'you': 26764,
 'only': 17493,
 'save': 21405,
 'model': 16239,
 'with': 26405,
 'all': 3042,
 'your': 26768,
 'mapping': 15466,
 'planes': 18636,
 'positioned': 18898,
 'carefully': 5443,
 'to': 24380,
 '3ds': 1152,
 'file': 10376,
 'when': 26235,
 'reload': 20439,
 'it': 13712,
 'after': 2853,
 'restarting': 20713,
 'they': 24155,
 'are': 3607,
 'given': 11310,
 'default': 7761,
 'position': 18896,
 'and': 3254,
 'orientation': 17648,
 'but': 5220,
 'prj': 19263,
 'their': 24092,
 'positions': 18901,
 'preserved': 19146,
 'does': 8620,
 'anyone': 3397,
 'know': 14291,
 'why': 26289,
 'this': 24191,
 'information': 13153,
 'is': 13651,
 'not': 17121,
 'stored': 23103,
 'in': 12920,
 'the': 24082,
 'nothing': 17134,
 'explicitly': 9935,
 'said': 21284,
 'manual': 15444,
 'about': 2427,
 'saving': 21409,
 'texture': 24057,
 'rules': 21177,
 'like': 14847,
 'be': 4326,
 'able': 2408,
 'read': 20041,
 'rule': 211

In [92]:
cvec = CountVectorizer(stop_words='english', min_df=5) # add stop words, min_df only includes counts of words x number of times
cvect.fit(X_train)

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
                lowercase=True, max_df=1.0, max_features=None, min_df=1,
                ngram_range=(1, 1), preprocessor=None, stop_words=None,
                strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
                tokenizer=None, vocabulary=None)

In [93]:
len(cvect.vocabulary_)

26879

In [94]:
# Fit on a model

from sklearn.linear_model import LogisticRegression

In [95]:
# Create and configure model

clf = LogisticRegression()

In [96]:
X_train_dtm = cvect.transform(data_train.data) 
X_test_dtm = cvect.transform(data_test.data)

In [97]:
clf.fit(X_train_dtm, data_train.target)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)

In [98]:
clf.score(X_test_dtm, data_test.target)

0.7198817442719881

In [99]:
# Test out hashing and TF-IDF to improve score

from sklearn.feature_extraction.text import TfidfVectorizer

In [100]:
tfidf = TfidfVectorizer(stop_words='english')

In [101]:
tfidf.fit(data_train.data)

TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.float64'>, encoding='utf-8',
                input='content', lowercase=True, max_df=1.0, max_features=None,
                min_df=1, ngram_range=(1, 1), norm='l2', preprocessor=None,
                smooth_idf=True, stop_words='english', strip_accents=None,
                sublinear_tf=False, token_pattern='(?u)\\b\\w\\w+\\b',
                tokenizer=None, use_idf=True, vocabulary=None)

In [102]:
X_train_T = tfidf.transform(data_train.data)
X_test_T = tfidf.transform(data_test.data)

In [103]:
lgr = LogisticRegression()
lgr.fit(X_train_T, data_train.target)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)

In [104]:
lgr.score(X_test_T, data_test.target)

0.7479674796747967

In [105]:
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV

In [106]:
pipe = Pipeline([('tfidf', TfidfVectorizer()),('lgr', LogisticRegression())])

In [107]:
params = {'tfidf__max_features': [10,100,1000,5000,10000],
         'tfidf__min_df': [2,3,4,5,10]}

In [108]:
grid = GridSearchCV(pipe, param_grid=params)

In [109]:
grid.fit(data_train.data, data_train.target)

GridSearchCV(cv='warn', error_score='raise-deprecating',
             estimator=Pipeline(memory=None,
                                steps=[('tfidf',
                                        TfidfVectorizer(analyzer='word',
                                                        binary=False,
                                                        decode_error='strict',
                                                        dtype=<class 'numpy.float64'>,
                                                        encoding='utf-8',
                                                        input='content',
                                                        lowercase=True,
                                                        max_df=1.0,
                                                        max_features=None,
                                                        min_df=1,
                                                        ngram_range=(1, 1),
                                    

In [110]:
grid.score(data_test.data, data_test.target)

0.7405764966740577

In [111]:
grid.best_estimator_.get_params

<bound method Pipeline.get_params of Pipeline(memory=None,
         steps=[('tfidf',
                 TfidfVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.float64'>,
                                 encoding='utf-8', input='content',
                                 lowercase=True, max_df=1.0, max_features=5000,
                                 min_df=2, ngram_range=(1, 1), norm='l2',
                                 preprocessor=None, smooth_idf=True,
                                 stop_words=None, strip_accents=None,
                                 sublinear_tf=False,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=None, use_idf=True,
                                 vocabulary=None)),
                ('lgr',
                 LogisticRegression(C=1.0, class_weight=None, dual=False,
                                    