In [70]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score
from sklearn.svm import LinearSVC

In [36]:
# Tiny hand-written sample: two free-text columns plus a 0/1 label.
data = dict(
    text1=['hello world', 'good morning', 'how are you'],
    text2=['foo bar', 'lorem ipsum', 'blah blah'],
    target=[1, 0, 1],  # binary class label
)
df = pd.DataFrame(data)

In [37]:
# Print the column names, dtypes and non-null counts of the sample frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3 entries, 0 to 2
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   text1   3 non-null      object
 1   text2   3 non-null      object
 2   target  3 non-null      int64 
dtypes: int64(1), object(2)
memory usage: 204.0+ bytes


In [59]:
# Load the prepared dataset and give its columns the generic names used below
# (two text inputs and one target).
column_aliases = {"text": "text1", "path": "text2", "is_story": "target"}
df = pd.read_csv("output/prepared-data.csv").rename(columns=column_aliases)

In [61]:
# Replace missing text with empty strings so both text columns contain only str values.
# Only the text columns are filled: a missing label stays NaN (and therefore visible)
# instead of being silently turned into "". Rebinding `df` rather than using
# inplace=True keeps the cell safe to re-run and avoids mutating the frame in place.
df = df.fillna({"text1": "", "text2": ""})

In [62]:
# Re-check dtypes and non-null counts after the missing values were filled.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2846 entries, 0 to 2845
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   text1   2846 non-null   object
 1   text2   2846 non-null   object
 2   target  2846 non-null   int64 
dtypes: int64(1), object(2)
memory usage: 66.8+ KB


In [63]:
# Refer to the columns by name rather than by position, so that a reordered or
# extended CSV cannot silently swap a feature column with the label.
x1_col = "text1"   # first text feature (renamed from "text" when the CSV is loaded)
x2_col = "text2"   # second text feature (renamed from "path" when the CSV is loaded)
y_col = "target"   # label column (renamed from "is_story" when the CSV is loaded)

In [71]:
# Character n-gram bag-of-words settings applied to each text column.
# min_df / max_df are floats, i.e. document proportions: n-grams that occur in fewer
# than 10% or in more than 90% of the fitted documents are discarded.
# ngram_range=(1, 8) with analyzer="char" extracts character sequences of length 1-8.
vectorizer = CountVectorizer(min_df=0.1, max_df=0.9, ngram_range=(1, 8), analyzer="char")

# Vectorize each text column separately and stack the resulting count matrices.
# Each column is selected with a plain string (not a list) so the vectorizer receives
# a 1-D sequence of documents. The same unfitted vectorizer is listed twice;
# ColumnTransformer fits a clone per entry, so each column gets its own vocabulary.
preprocessor = ColumnTransformer(
    transformers=[
        (x1_col, vectorizer, x1_col),
        (x2_col, vectorizer, x2_col),
    ],
    remainder='drop',  # ignore any column that is not listed above
)

# Full model: text vectorization followed by a linear SVM.
# random_state is pinned (same seed as the train/test split) so that re-running the
# notebook reproduces the same fitted model and reported score.
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', LinearSVC(random_state=42)),
])

In [72]:
# Separate the model inputs (the two text columns) from the label column.
X = df.loc[:, [x1_col, x2_col]]
y = df.loc[:, y_col]

In [73]:
# Display the feature frame as the cell output.
X

Unnamed: 0,text1,text2
0,Trouble in Kenya's Flower Fields,/2017/12/trouble-in/
1,Asylum for Sale Refugees Say Some U.N. Workers...,/2019/04/unhcr-corruption-refugee-resettlement/
2,Documentaries as AdvertisingCorporate Interest...,/2019/12/documentaries-as-advertising/
3,Pandemic Drives Wave of Property Grabs in Zambia,/2020/12/propertygrabs/
4,Did Industry Funding Influence an FDA Investig...,/2022/07/did-industry-funding-influence-an-fda...
...,...,...
2841,MYSTERY: Girl With Brain Condition 'Cured Self...,/evie-mae-geurts-miracle-hydrocephalus/
2842,STUDY: Octopus Brain More Similar To Human Bra...,/human-octopus-brain-jumping-genes/
2843,Most Long-COVID Sufferers Battle Neurological ...,/long-covid-brain-neurological-symptoms/
2844,Classical Music Relieves Pain?,/music-pain-relief-classical/


In [74]:
# Display the label series as the cell output.
y

0       1
1       1
2       1
3       1
4       1
       ..
2841    1
2842    1
2843    1
2844    1
2845    1
Name: target, Length: 2846, dtype: int64

In [75]:
# Hold out 20% of the rows for evaluation; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [76]:
# Fit the whole pipeline (preprocessor, then classifier) on the training split only,
# so nothing learned during fitting comes from the held-out test rows.
pipeline.fit(X_train, y_train)

In [77]:
# Predict labels for the held-out test rows using the fitted pipeline.
y_pred = pipeline.predict(X_test)

In [78]:
# Score the predictions: fraction of test rows whose predicted label equals the true one.
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print(f'Accuracy: {accuracy}')

Accuracy: 0.9298245614035088
