# Gateway to Research Funder Model

In [None]:
%run ../notebook_preamble.ipy

## Load Data

In [None]:
import ast

In [None]:
# Load the tokenised GtR table. The 'processed_documents' column is parsed
# with ast.literal_eval so each cell becomes a Python object, not a string.
tokenised_csv_path = f'{data_path}/processed/gtr_tokenised.csv'
gtr_tokenised_df = pd.read_csv(
    tokenised_csv_path,
    converters={'processed_documents': ast.literal_eval},
)

In [None]:
# Preview the first rows to sanity-check that the file loaded as expected.
gtr_tokenised_df.head()

## Make Model

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.metrics import classification_report

### Train Test Split

In [None]:
# Hold out 20% of the rows for final evaluation; the target is the
# 'leadFunder' column. A fixed seed makes the shuffle -- and therefore the id
# files written from these frames -- repeatable across notebook runs instead
# of producing a different partition every time the cell is executed.
df_train, df_test, y_train, y_test = train_test_split(
    gtr_tokenised_df, gtr_tokenised_df['leadFunder'],
    shuffle=True, test_size=0.2, random_state=42,
)

In [None]:
# Persist the 'id' column of each partition so the exact train/test
# membership can be recovered later without re-running the split.
train_ids = df_train['id']
test_ids = df_test['id']
train_ids.to_csv(f'{data_path}/processed/gtr_funder_ids_train.csv', index=False)
test_ids.to_csv(f'{data_path}/processed/gtr_funder_ids_test.csv', index=False)

### Estimator Pipeline

In [None]:
# Two-stage text classifier: TF-IDF features followed by logistic regression.
estimators = [
    ('tfidf', TfidfVectorizer()),
    ('logr', LogisticRegression()),
]
pipe = Pipeline(steps=estimators)

# Search space, keyed as '<step name>__<parameter>' so GridSearchCV can route
# each value to the matching pipeline step (3 x 3 x 4 = 36 combinations).
params = {
    'tfidf__max_df': [0.8, 0.9, 1.0],
    'tfidf__min_df': [1, 2, 3],
    'logr__C': [0.1, 1, 10, 100],
}

# Exhaustive search over the grid with 3-fold cross-validation, 3 parallel jobs.
clf = GridSearchCV(
    estimator=pipe,
    param_grid=params,
    cv=3,
    verbose=4,
    n_jobs=3,
)

In [None]:
# Collapse each document's token sequence into one space-separated string,
# which is the form passed to the TF-IDF pipeline for fitting and prediction.
x_train = df_train['processed_documents'].apply(' '.join)
x_test = df_test['processed_documents'].apply(' '.join)

In [None]:
# Run the cross-validated grid search on the training documents and labels.
clf.fit(x_train, y_train)

In [None]:
# Take the best pipeline found by the grid search and evaluate it on the
# held-out test documents.
clf_best = clf.best_estimator_
y_pred = clf_best.predict(x_test)

# Text summary of the test-set predictions against the true labels.
test_report = classification_report(y_test, y_pred)
print(test_report)

In [None]:
from yellowbrick.classifier import ConfusionMatrix, ClassificationReport

In [None]:
# Yellowbrick visual classification report for the best estimator, scored on
# the held-out test data.
# NOTE(review): Yellowbrick 1.0 renamed poof() to show() and keeps poof() only
# as a deprecated alias -- confirm and switch when the dependency is upgraded.
cr = ClassificationReport(clf_best)
cr.score(x_test, y_test)
cr.poof()

In [None]:
# Yellowbrick confusion matrix for the best estimator, scored on the held-out
# test data.
# NOTE(review): Yellowbrick 1.0 renamed poof() to show() and keeps poof() only
# as a deprecated alias -- confirm and switch when the dependency is upgraded.
cm = ConfusionMatrix(clf_best)
cm.score(x_test, y_test)
cm.poof()

In [None]:
# joblib is used below to serialise the fitted model. scikit-learn deprecated
# its vendored copy (sklearn.externals.joblib) in 0.21 and removed it in 0.23,
# so prefer the standalone package and fall back only on old installs.
try:
    import joblib
except ImportError:
    from sklearn.externals import joblib

In [None]:
# Persist the best fitted pipeline under the project's models directory.
joblib.dump(clf_best, f'{project_dir}/models/gtr_abstractText_leadFunder_model.pkl')