# Step 1 - TRAIN

In [1]:
import pandas as pd
import dill
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
from sklearn.metrics import roc_auc_score, roc_curve, precision_recall_curve
from sklearn.metrics import f1_score

#working with text
from sklearn.feature_extraction.text import TfidfVectorizer

#normalizing data
from sklearn.preprocessing import StandardScaler

#pipeline
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.metrics import precision_score,recall_score

#imputer
from sklearn.impute import SimpleImputer

import sklearn.datasets

In [2]:
# Raw e-mail corpus; the first rows are previewed as the cell output.
DATA_PATH = 'Spam Email raw text for NLP.csv'
df = pd.read_csv(DATA_PATH)
df.head(3)

Unnamed: 0,CATEGORY,MESSAGE,FILE_NAME
0,1,"Dear Homeowner,\n\n \n\nInterest Rates are at ...",00249.5f45607c1bffe89f60ba1ec9f878039a
1,1,ATTENTION: This is a MUST for ALL Computer Use...,00373.ebe8670ac56b04125c25100a36ab0510
2,1,This is a multi-part message in MIME format.\n...,00214.1367039e50dc6b7adb0f2aa8aba83216


In [3]:
# Number of rows per label value in the CATEGORY column
df['CATEGORY'].value_counts()

0    3900
1    1896
Name: CATEGORY, dtype: int64

In [4]:
# Hold out 33% of the rows for testing with a fixed seed.
# The whole frame is passed as X, so X_train / X_test still carry the CATEGORY column.
X_train, X_test, y_train, y_test = train_test_split(
    df,
    df['CATEGORY'],
    test_size=0.33,
    random_state=42,
)

# Persist every split without the index column: test parts first, then train parts.
for file_name, part in (
    ("X_test.csv", X_test),
    ("y_test.csv", y_test),
    ("X_train.csv", X_train),
    ("y_train.csv", y_train),
):
    part.to_csv(file_name, index=None)

In [5]:
class ColumnSelector(BaseEstimator, TransformerMixin):
    """
    Stateless transformer that extracts one column so later pipeline steps
    can work on that column alone.

    Parameters
    ----------
    key : label of the column to extract; used as ``X[key]``.
    """
    def __init__(self, key):
        self.key = key

    def fit(self, X, y=None):
        """Nothing is learned; returns the transformer itself."""
        return self

    def transform(self, X):
        """Return ``X[self.key]``."""
        selected = X[self.key]
        return selected
    

class TextImputer(BaseEstimator, TransformerMixin):
    """
    Transformer that replaces missing values in a single column with a constant.

    Parameters
    ----------
    key : label of the column whose missing values are filled.
    value : replacement passed to ``fillna`` for that column.
    """
    def __init__(self, key, value):
        self.key = key
        self.value = value

    def fit(self, X, y=None):
        """Nothing is learned; returns the transformer itself."""
        return self

    def transform(self, X):
        """Return a copy of ``X`` with missing values in ``self.key`` filled.

        The input frame is left untouched: writing the column back into the
        caller's object would silently modify the train/test frames that are
        passed through the pipeline (and can raise SettingWithCopyWarning
        when ``X`` is itself a slice of another frame).
        """
        filled = X.copy()
        filled[self.key] = filled[self.key].fillna(self.value)
        return filled

In [6]:
# Label column, and the two text columns the feature pipelines read.
target = 'CATEGORY'
features = ['MESSAGE', 'FILE_NAME']

In [7]:
def _text_branch(column):
    """Build the impute -> select -> TF-IDF sub-pipeline for one text column."""
    return Pipeline([
        ('imputer', TextImputer(column, '')),
        ('selector', ColumnSelector(key=column)),
        ('tfidf', TfidfVectorizer()),
    ])


# One branch per text column; each gets its own TF-IDF vectorizer.
MESSAGE = _text_branch('MESSAGE')
FILE_NAME = _text_branch('FILE_NAME')

# combine both branches into a single feature block
feats = FeatureUnion([
    ('MESSAGE', MESSAGE),
    ('FILE_NAME', FILE_NAME),
])

In [8]:
%%time

pipeline = Pipeline([
    ('features', feats),
    ('classifier', LogisticRegression()),
])

pipeline.fit(X_train, y_train)

Wall time: 1.33 s


Pipeline(steps=[('features',
                 FeatureUnion(transformer_list=[('MESSAGE',
                                                 Pipeline(steps=[('imputer',
                                                                  TextImputer(key='MESSAGE',
                                                                              value='')),
                                                                 ('selector',
                                                                  ColumnSelector(key='MESSAGE')),
                                                                 ('tfidf',
                                                                  TfidfVectorizer())])),
                                                ('FILE_NAME',
                                                 Pipeline(steps=[('imputer',
                                                                  TextImputer(key='FILE_NAME',
                                                                             

In [9]:
# Inspect the (name, estimator) steps of the fitted pipeline
pipeline.steps

[('features',
  FeatureUnion(transformer_list=[('MESSAGE',
                                  Pipeline(steps=[('imputer',
                                                   TextImputer(key='MESSAGE',
                                                               value='')),
                                                  ('selector',
                                                   ColumnSelector(key='MESSAGE')),
                                                  ('tfidf', TfidfVectorizer())])),
                                 ('FILE_NAME',
                                  Pipeline(steps=[('imputer',
                                                   TextImputer(key='FILE_NAME',
                                                               value='')),
                                                  ('selector',
                                                   ColumnSelector(key='FILE_NAME')),
                                                  ('tfidf',
                         

In [10]:
# Serialize the fitted pipeline to disk so it can be reloaded without retraining.
with open("logreg_pipeline.dill", "wb") as out_strm:
    dill.dump(pipeline, out_strm)

# Step 2 - PREDICT

In [11]:
# Reload the held-out split from the CSV files; both come back as DataFrames.
X_test, y_test = (pd.read_csv(csv_name) for csv_name in ("X_test.csv", "y_test.csv"))

In [12]:
# Preview the first rows of the reloaded test frame
X_test.head(3)

Unnamed: 0,CATEGORY,MESSAGE,FILE_NAME
0,0,"You could try HTTrack, available here: http://...",00420.f6140b71df992b02cc59548039eb05ca
1,0,"At 01:12 AM 8/24/02 -0700, Adam L. Beberg wrot...",00314.611159749e214b996589d557e335648e
2,0,Gary Lawrence Murphy wrote:\n\n>and say hello ...,00714.16c4d34ab2c9622fe82de9570946f9ef


In [13]:
# Restore the fitted pipeline from disk.
# NOTE: dill.load can execute arbitrary code -- only load files you created yourself.
with open('logreg_pipeline.dill', 'rb') as pipeline_file:
    pipeline = dill.load(pipeline_file)

In [14]:
# Display the pipeline object restored from disk
pipeline

Pipeline(steps=[('features',
                 FeatureUnion(transformer_list=[('MESSAGE',
                                                 Pipeline(steps=[('imputer',
                                                                  TextImputer(key='MESSAGE',
                                                                              value='')),
                                                                 ('selector',
                                                                  ColumnSelector(key='MESSAGE')),
                                                                 ('tfidf',
                                                                  TfidfVectorizer())])),
                                                ('FILE_NAME',
                                                 Pipeline(steps=[('imputer',
                                                                  TextImputer(key='FILE_NAME',
                                                                             

In [15]:
# Keep only column 1 of predict_proba, i.e. the probability assigned to label 1.
class_proba = pipeline.predict_proba(X_test)
preds = class_proba[:, 1]

# Store the scores in a one-column CSV without the index.
pred_df = pd.DataFrame({'preds': preds})
pred_df.to_csv("test_predictions.csv", index=None)

In [16]:
# First ten predicted probabilities
preds[:10]

array([0.03555344, 0.15388392, 0.09508508, 0.10314481, 0.06212979,
       0.99250233, 0.10237711, 0.60826906, 0.08018588, 0.67784208])

In [17]:
# precision / recall have one more entry than thresholds: the curve ends with a
# final point (precision=1, recall=0) that has no threshold attached to it.
precision, recall, thresholds = precision_recall_curve(y_test, preds)

# F1 at every curve point. A point with precision + recall == 0 would give 0/0 = NaN,
# and np.argmax returns the position of a NaN, so those points are scored 0 instead.
pr_sum = precision + recall
fscore = np.divide(2 * precision * recall, pr_sum,
                   out=np.zeros_like(pr_sum), where=pr_sum > 0)
# locate the index of the largest f score among the points that have a threshold,
# so thresholds[ix] can never run past the end of the array
ix = np.argmax(fscore[:-1])
print(f'Best Threshold={thresholds[ix]}, F-Score={fscore[ix]:.3f}, Precision={precision[ix]:.3f}, Recall={recall[ix]:.3f}')

Best Threshold=0.2579382453595056, F-Score=0.980, Precision=0.968, Recall=0.993


# Step 3 - FLASK

In [18]:
#!/Users/lmv/opt/anaconda3/envs/pytorch/bin/pip3 install flask-ngrok

In [19]:
#from flask_ngrok import run_with_ngrok
from flask import Flask, request, jsonify
import pandas as pd

# Trial run of Flask: a minimal app with a single route

app = Flask(__name__)
#run_with_ngrok(app)  # Start ngrok when app is run

@app.route("/a")
def hello():
    """Handler for the ``/a`` route; returns a fixed greeting string."""
    return "Hello World!"

# Starts the Flask server from this cell when run as __main__
if __name__ == '__main__':
    app.run()

In [20]:
import pandas as pd
import dill

In [21]:
# Load the trained model (the pipeline serialized with dill).
# NOTE: dill.load can execute arbitrary code -- only load files you created yourself.
with open('logreg_pipeline.dill', 'rb') as model_file:
    model = dill.load(model_file)

In [22]:
# Read the saved test labels and test rows back from their CSV files.
y_test = pd.read_csv("y_test.csv")
X_test = pd.read_csv("X_test.csv")

Запустить сервис и не глушить его, пока работаем

In [26]:
# Handlers and Flask startup
# A fresh Flask application object; the route handlers below are registered on it.
app = Flask(__name__)
#run_with_ngrok(app)  # Start ngrok when app is run


@app.route("/", methods=["GET"])
def general():
    """Handler for GET ``/``; returns a fixed welcome string."""
    return "Welcome to prediction process"

@app.route('/predict', methods=['POST'])
def predict():
    """Score one e-mail with the loaded model and return the result as JSON.

    The request body is expected to be a JSON object with the optional fields
    ``MESSAGE`` and ``FILE_NAME``. A field that is absent, null or empty is
    scored as an empty string.

    Returns:
        On success, JSON with ``success`` = true, ``predictions`` (the value in
        column 1 of ``model.predict_proba`` for the submitted row) and the
        echoed ``MESSAGE``.
        If the body is not a JSON object, JSON with ``success`` = false and an
        ``error`` text, with HTTP status 400.
    """
    data = {"success": False}

    # silent=True yields None for a missing or malformed JSON body instead of raising
    request_json = request.get_json(silent=True)
    if not isinstance(request_json, dict):
        data["error"] = "request body must be a JSON object"
        return jsonify(data), 400

    # .get() tolerates absent keys; `or ""` maps null/empty values to "".
    # str() keeps non-string JSON values from reaching the text vectorizer as-is.
    MESSAGE = str(request_json.get('MESSAGE') or "")
    FILE_NAME = str(request_json.get('FILE_NAME') or "")

    print(MESSAGE)
    preds = model.predict_proba(pd.DataFrame({'MESSAGE': [MESSAGE],
                                              'FILE_NAME': [FILE_NAME]}))
    # plain Python float so the value is always JSON-serializable
    data['predictions'] = float(preds[:, 1][0])
    data['MESSAGE'] = MESSAGE
    # indicate that the request was a success
    data["success"] = True
    print('OK')

    # return the data dictionary as a JSON response
    return jsonify(data)


# Starts the Flask server from this cell when run as __main__
if __name__ == '__main__':
    app.run()

 * Serving Flask app "__main__" (lazy loading)
 * Environment: production
[2m   Use a production WSGI server instead.[0m
 * Debug mode: off


 * Running on http://127.0.0.1:5000/ (Press CTRL+C to quit)
127.0.0.1 - - [08/Jul/2023 21:45:20] "POST //predict HTTP/1.1" 200 -


At 01:12 AM 8/24/02 -0700, Adam L. Beberg wrote:



>If the creator didnt say you could have it without paying, it's theft, so

>simple, hell that's even in all the major holy books.



In which world are we talking about?  That may be true for the first sale, 

but once something is out in the world, the "creator" loses control... If I 

buy a chair you built, and then decide to give it away to my neighbor, by 

you're definition, he just stole from you.



>Fair use needs to be clarified a bit and then I hope they start locking

>people up. How else do i ever have hope of finding a job working for someone

>that makes things people are supposed to ... *drumroll* pay for.



Why is it that people don't understand that giving stuff away is a 

perfectly acceptable tactic in capitalist businesses?  In many places, it's 

called "advertising": "buy one, get one free", "free shipping on any order 

over $25", "buy this couch, and get a coffee table for free", "free popcorn 

with any movi

127.0.0.1 - - [08/Jul/2023 21:46:32] "POST //predict HTTP/1.1" 200 -


At 01:12 AM 8/24/02 -0700, Adam L. Beberg wrote:



>If the creator didnt say you could have it without paying, it's theft, so

>simple, hell that's even in all the major holy books.



In which world are we talking about?  That may be true for the first sale, 

but once something is out in the world, the "creator" loses control... If I 

buy a chair you built, and then decide to give it away to my neighbor, by 

you're definition, he just stole from you.



>Fair use needs to be clarified a bit and then I hope they start locking

>people up. How else do i ever have hope of finding a job working for someone

>that makes things people are supposed to ... *drumroll* pay for.



Why is it that people don't understand that giving stuff away is a 

perfectly acceptable tactic in capitalist businesses?  In many places, it's 

called "advertising": "buy one, get one free", "free shipping on any order 

over $25", "buy this couch, and get a coffee table for free", "free popcorn 

with any movi

127.0.0.1 - - [08/Jul/2023 21:49:19] "POST //predict HTTP/1.1" 200 -


ATTENTION: This is a MUST for ALL Computer Use
OK


127.0.0.1 - - [08/Jul/2023 21:50:29] "POST //predict HTTP/1.1" 200 -
127.0.0.1 - - [08/Jul/2023 21:50:29] "POST //predict HTTP/1.1" 200 -
127.0.0.1 - - [08/Jul/2023 21:50:29] "POST //predict HTTP/1.1" 200 -
127.0.0.1 - - [08/Jul/2023 21:50:29] "POST //predict HTTP/1.1" 200 -
127.0.0.1 - - [08/Jul/2023 21:50:29] "POST //predict HTTP/1.1" 200 -
127.0.0.1 - - [08/Jul/2023 21:50:29] "POST //predict HTTP/1.1" 200 -
127.0.0.1 - - [08/Jul/2023 21:50:29] "POST //predict HTTP/1.1" 200 -
127.0.0.1 - - [08/Jul/2023 21:50:29] "POST //predict HTTP/1.1" 200 -
127.0.0.1 - - [08/Jul/2023 21:50:29] "POST //predict HTTP/1.1" 200 -
127.0.0.1 - - [08/Jul/2023 21:50:29] "POST //predict HTTP/1.1" 200 -
127.0.0.1 - - [08/Jul/2023 21:50:29] "POST //predict HTTP/1.1" 200 -
127.0.0.1 - - [08/Jul/2023 21:50:29] "POST //predict HTTP/1.1" 200 -
127.0.0.1 - - [08/Jul/2023 21:50:29] "POST //predict HTTP/1.1" 200 -
127.0.0.1 - - [08/Jul/2023 21:50:29] "POST //predict HTTP/1.1" 200 -
127.0.0.1 - - [08/Jul/2023 21:50:2

You could try HTTrack, available here: http://www.httrack.com/index.php



It does recursive grabs of stuff, gets pictures etc.



Hello everyone, by the way.



-- 

Irish Linux Users' Group: ilug@linux.ie

http://www.linux.ie/mailman/listinfo/ilug for (un)subscription information.

List maintainer: listmaster@linux.ie



OK
At 01:12 AM 8/24/02 -0700, Adam L. Beberg wrote:



>If the creator didnt say you could have it without paying, it's theft, so

>simple, hell that's even in all the major holy books.



In which world are we talking about?  That may be true for the first sale, 

but once something is out in the world, the "creator" loses control... If I 

buy a chair you built, and then decide to give it away to my neighbor, by 

you're definition, he just stole from you.



>Fair use needs to be clarified a bit and then I hope they start locking

>people up. How else do i ever have hope of finding a job working for someone

>that makes things people are supposed to ... *drumroll*

127.0.0.1 - - [08/Jul/2023 21:50:29] "POST //predict HTTP/1.1" 200 -
127.0.0.1 - - [08/Jul/2023 21:50:29] "POST //predict HTTP/1.1" 200 -
127.0.0.1 - - [08/Jul/2023 21:50:29] "POST //predict HTTP/1.1" 200 -
127.0.0.1 - - [08/Jul/2023 21:50:29] "POST //predict HTTP/1.1" 200 -
127.0.0.1 - - [08/Jul/2023 21:50:29] "POST //predict HTTP/1.1" 200 -
127.0.0.1 - - [08/Jul/2023 21:50:29] "POST //predict HTTP/1.1" 200 -
127.0.0.1 - - [08/Jul/2023 21:50:29] "POST //predict HTTP/1.1" 200 -
127.0.0.1 - - [08/Jul/2023 21:50:29] "POST //predict HTTP/1.1" 200 -
127.0.0.1 - - [08/Jul/2023 21:50:29] "POST //predict HTTP/1.1" 200 -
127.0.0.1 - - [08/Jul/2023 21:50:29] "POST //predict HTTP/1.1" 200 -
127.0.0.1 - - [08/Jul/2023 21:50:29] "POST //predict HTTP/1.1" 200 -
127.0.0.1 - - [08/Jul/2023 21:50:29] "POST //predict HTTP/1.1" 200 -
127.0.0.1 - - [08/Jul/2023 21:50:30] "POST //predict HTTP/1.1" 200 -
127.0.0.1 - - [08/Jul/2023 21:50:30] "POST //predict HTTP/1.1" 200 -
127.0.0.1 - - [08/Jul/2023 21:50:3

Long time no chat!



How have you been? If you've been like me, you've been trying

trying almost EVERYTHING to lose weight.  I know how you feel

- the special diets, miracle pills, and fancy exercise 

equipment never helped me lose the pounds I needed to lose

either.  It seemed like the harder I worked at it, the less

weight I lost - until I heard about 'Extreme Power Plus'.



You're probably thinking to yourself, "Oh geez, not another

miracle diet pill!"  Like you, I was skeptical at first, but 

my sister said it helped her lose 23 pounds in just 2 weeks, 

so I told her I'd give it a try.  I mean, there was nothing 

to lose except a lot of weight!  Let me tell you, it was

the best decision I've ever made.  PERIOD.  Six months later,

as I'm writing this message to you, I've gone from 355 pounds

to 210 pounds, and I haven't changed my exercise routine or diet

at all.  Yes, I still eat pizza, and lots of it!



I was so happy with the results that I contacted the manufactu