In [245]:
import pandas as pd
import spacy
import random
import joblib

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

nlp = spacy.load("en_core_web_sm")

# EDA

In [189]:
df = pd.read_csv('../data/dataset.csv', delimiter=';')
df.head()

Unnamed: 0,description,fraudulent
0,"Food52, a fast-growing, James Beard Award-winn...",0
1,Organised - Focused - Vibrant - Awesome!Do you...,0
2,"Our client, located in Houston, is actively se...",0
3,THE COMPANY: ESRI – Environmental Systems Rese...,0
4,JOB TITLE: Itemization Review ManagerLOCATION:...,0


In [190]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17880 entries, 0 to 17879
Data columns (total 2 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   description  17879 non-null  object
 1   fraudulent   17880 non-null  int64 
dtypes: int64(1), object(1)
memory usage: 279.5+ KB


In [88]:
df['fraudulent'].value_counts()

fraudulent
0    17014
1      866
Name: count, dtype: int64

In [89]:
df['fraudulent'].value_counts(normalize=True)

fraudulent
0    0.951566
1    0.048434
Name: proportion, dtype: float64

## Thoughts and hypothesis

- Binary classification
- Very unbalanced! 95% are real vs 5% fake. Need to be careful when we assess quality of models + training the data (should I upsample the fake ones? Or downsample the good ones). 
- Accuracy is not a good measure.
- What could differientiate real vs fake ads? Special words (scam words), length of text, external links, more words in uppercase (so careful if preprocessing and putting all to lowercase)

## To Do

MVP
- Clean dataset: duplicates, null values
- Quick check of fraudulent ads to see if anything obvious (length of text, scam words, ...)
- Check if language is English for all, since we ll use english language to clean
- Create a baseline model before any preprocessing and feature engineering -> save model
- Create API
- Create docker image
- Test API

V2
- Remove common english words, punctuation, ...
- Possible feature engineering
- Train different models (simple to more complex).
- Test not only using accuracy as metrics
- Redeploy best model to API

# Data cleaning

In [191]:
df[df['description'].isnull()]

Unnamed: 0,description,fraudulent
17513,,1


In [192]:
df.dropna(subset=['description'], inplace=True)

In [193]:
df[df.duplicated(subset=['description'], keep=False)].sort_values('description')

Unnamed: 0,description,fraudulent
14118,"""Pride is a personal commitment. It is an att...",0
4193,"""Pride is a personal commitment. It is an att...",0
13528,"""We take great care of our CarePartners so the...",0
12007,"""We take great care of our CarePartners so the...",0
11806,"""We take great care of our CarePartners so the...",0
...,...,...
4520,•Prepares source data for computer entry by co...,0
9992,• Answering incoming calls and securing ord...,0
9196,• Answering incoming calls and securing ord...,0
15441,"￼￼Create, maintain and adjust portfolio of ass...",0


In [194]:
df.drop_duplicates(subset=['description'], inplace=True)

In [195]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 14801 entries, 0 to 17878
Data columns (total 2 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   description  14801 non-null  object
 1   fraudulent   14801 non-null  int64 
dtypes: int64(1), object(1)
memory usage: 346.9+ KB


In [197]:
df['fraudulent'].value_counts()

fraudulent
0    14170
1      631
Name: count, dtype: int64

In [198]:
df['fraudulent'].value_counts(normalize=True)

fraudulent
0    0.957368
1    0.042632
Name: proportion, dtype: float64

# Check of fraudulent ads

In [199]:
fraud_df = df[df['fraudulent'] == 1]
random_row = random.randint(0, len(fraud_df) - 1)
fraud_df.iloc[random_row]['description']

'We are Looking for a person with strong writing skills and demonstrable experience building Microsoft Excel spreadsheets and Microsoft PowerPoint presentations. Must be comfortable interacting with customers and potential customers both on the phone and via e-mail. College Education preferred.Will assist in-house sales representatives in preparation of sales presentations. Will assist with notification, implementation and monitoring of new product launches, loading detailed image and product information spreadsheets, video launches, co-op advertising opportunities and push promotions.Must have strong organizational skills and must be detail oriented. Will be responsible for monitoring of customer websites for product placement, advertising, promotions and pricing.Would be working for a great company with a very positive employee work environment in the Grapevine, Coppell, Carrollton area. Equal Opportunity Employer.'

### Quick notes
- Not all fraudulent look obvious, but some do
- Words used: free time, cash, today, no experience needed, from home, day/daily, extra
- Numbers in them (?) (x$/day)
- URLs?

### Examples

- 'Cash In Hand Job (Urgent Staff Required)No Experience Required And Never Any Fees.Work Anytime 1 To 2 Hrs Daily In Free Time.Earn Easily $400 To $500 Extra Per Day.Totally Free To Join &amp; Suitable For All.Take Action &amp; Get Started Today.Please contact us.'
- We are looking for inbound call representativesWe provide complete training...We pay Daily!$20.00 to $200.00 plus a dayGive us a call TODAY
- 'customer service reps needed asap\xa0'
- 'We have several openings available in this area earning $1000.00-$2500.00 per week.\xa0We are seeking only honest, self-motivated people with a desire to work in the home typing and data entry field, from the comfort of their own homes.The preferred applicants should be at least 18 years old with Internet access. No experience is needed. However the following skills are desirable: \xa0'
- 'Student Positions Part-Time and Full-Time.You can do it all from home, in your free time, at your own place.Spend 30 minutes or 1 hours a day &amp; Get biggest cash.You can work in the morning, afternoon, or at night.Perfect for everyone then start immediately.Can earn $350 to $450 extra per day.No any experience required.Zero start-up fee, Visit here:-#URL_7ebe37f71633be1b80547d6f213cb0075a63b6ced35281bfa5c067b5c685f04c#-rg.info'

In [240]:
real_df = df[df['fraudulent'] == 0]
random_row = random.randint(0, len(real_df) - 1)
real_df.iloc[random_row]['description']

'We are Netguru and we love to develop web application based on Ruby On Rails framework. We value quality, transparent communication and passion for #URL_29ee3c13b00b08139a947ea0c6a2de501300346e0ebc160486275c4253ff93a2# are always on the lookout for passionate Ruby on Rails developers! If you have talent and skills to deliver the best quality - check out our offer.'

In [241]:
real_df['description'].str.split().str.len().mean()

np.float64(178.94015525758644)

In [242]:
real_df['description'].str.split().explode().value_counts().head(10)

description
and     146656
to       80037
the      75876
of       56884
a        52357
in       42808
for      37356
with     33846
is       24234
our      23509
Name: count, dtype: int64

In [243]:
# Check for average number of words
fraud_df['description'].str.split().str.len().mean()

np.float64(164.63708399366087)

In [244]:
# Check for most common words
fraud_df['description'].str.split().explode().value_counts().head(10)

description
and     6348
to      3017
the     3012
of      2153
a       1706
in      1657
for     1488
with    1295
is       824
are      702
Name: count, dtype: int64

# MVP

In [None]:
# Split into features and labels
X = df['description']
y = df['fraudulent']

# Train/test split
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size=0.3, random_state=42)

# TF-IDF Vectorization
vectorizer = TfidfVectorizer(stop_words='english', max_features=1000)
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

# Train logistic regression model
model = LogisticRegression(class_weight='balanced', random_state=42)
model.fit(X_train_tfidf, y_train)

# Predict and evaluate
y_pred = model.predict(X_test_tfidf)
report = classification_report(y_test, y_pred, output_dict=True)
pd.DataFrame(report).transpose()

In [247]:
# Save the model
joblib.dump(model, '../models/baseline_model.pkl')

['../models/baseline_model.pkl']

# V2

In [222]:
def preprocess_text(text: str) -> str:
    text = str(text).lower()
    text = text.replace('.', '. ')
    text = text.replace('no experience', 'zero experience')
    doc = nlp(text)
    cleaned_tokens = [
        token.lemma_.lower() 
        for token in doc 
        if not token.is_stop and not token.is_punct and token.is_alpha
    ]
    cleaned_text = ' '.join(cleaned_tokens)
    
    return cleaned_text

In [223]:
fraud_df['description_cleaned'] = fraud_df['description'].apply(preprocess_text)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  fraud_df['description_cleaned'] = fraud_df['description'].apply(preprocess_text)


In [224]:
fraud_df.head()

Unnamed: 0,description,fraudulent,description_cleaned
98,"IC&amp;E Technician | Bakersfield, CA Mt. Poso...",1,technician bakersfield mt posoprincipal duty r...
144,The group has raised a fund for the purchase o...,1,group raise fund purchase home southeast stude...
173,Technician Instrument &amp; ControlsLocation D...,1,technician instrument amp controlslocation dew...
180,Sales Executive,1,sale executive
215,"IC&amp;E Technician | Bakersfield, CA Mt. Poso...",1,technician bakersfield mt posoprincipal duty r...


In [225]:
fraud_df['description'].str.split().explode().value_counts().head(10)

description
and     6348
to      3017
the     3012
of      2153
a       1706
in      1657
for     1488
with    1295
is       824
are      702
Name: count, dtype: int64

In [226]:
fraud_df['description_cleaned'].str.split().explode().value_counts().head(10)

description_cleaned
work          809
service       584
customer      520
project       500
product       457
team          453
experience    421
company       409
provide       403
position      402
Name: count, dtype: int64

In [None]:
real_df['description_cleaned'] = real_df['description'].apply(preprocess_text)
real_df.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  real_df['description_cleaned'] = real_df['description'].apply(preprocess_text)


In [229]:
real_df['description_cleaned'].str.split().explode().value_counts().head(10)

description_cleaned
team          18677
work          18446
customer      12127
service       10431
company       10052
business      10041
product        9960
client         9947
experience     9569
new            8521
Name: count, dtype: int64

In [None]:
df_cleaned = pd.concat([fraud_df[['description_cleaned', 'fraudulent']], 
                         real_df[['description_cleaned', 'fraudulent']]], 
                        ignore_index=True)
df_cleaned.head()

fraudulent
0    14170
1      631
Name: count, dtype: int64

In [None]:
X = df_cleaned['description_cleaned']
y = df_cleaned['fraudulent']

X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size=0.3, random_state=42)

vectorizer = TfidfVectorizer(stop_words='english', max_features=1000)
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

# Train logistic regression model
model = LogisticRegression(class_weight='balanced', random_state=42)
model.fit(X_train_tfidf, y_train)

y_pred = model.predict(X_test_tfidf)
report = classification_report(y_test, y_pred, output_dict=True)
pd.DataFrame(report).transpose()

Unnamed: 0,precision,recall,f1-score,support
0,0.986018,0.895579,0.938625,4252.0
1,0.233161,0.714286,0.351562,189.0
accuracy,0.887863,0.887863,0.887863,0.887863
macro avg,0.609589,0.804932,0.645094,4441.0
weighted avg,0.953978,0.887863,0.91364,4441.0
