<a href="https://colab.research.google.com/github/olinyoder2534/AIWritingDetector/blob/main/AIWritingDetectorCleanData.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import spacy
#import warnings
#warnings.filterwarnings('ignore')

## Data Exploration

In [None]:
# The dataset ships pre-split, so train and test are read from separate CSV files.
train, test = (
    pd.read_csv(csv_path)
    for csv_path in ('/content/final_train[1].csv', '/content/final_test[1].csv')
)

In [None]:
# Preview the first five rows of the training split.
train.head(n=5)

Unnamed: 0,text,label
0,We should keep the Electoral College for a num...,0
1,More and more money is spent on building theat...,1
2,Limiting car usage can actually be effective b...,0
3,"Dear Mrs. Smith,\n\nI am writing to you today ...",1
4,"Dear Principal,\n\nAfter school or during scho...",0


In [None]:
# Preview the first five rows of the test split.
test.head(n=5)

Unnamed: 0,text,label
0,The Face on Mars is nothing but a natural occu...,0
1,Students have a higher chance of catching a vi...,0
2,Driverless cars have good and bad things that ...,0
3,Some people might think that traveling in a gr...,1
4,How many of us students want to be forced to d...,0


In [None]:
# Shapes of both splits, one per line (train first, then test).
print(train.shape, test.shape, sep='\n')

(346977, 2)
(86587, 2)


In [None]:
# Class distribution of each split (train first, then test).
print(train.label.value_counts(), test.label.value_counts(), sep='\n')

label
0    222154
1    124823
Name: count, dtype: int64
label
0    55845
1    30742
Name: count, dtype: int64


## Preprocessing

### Downsampling/rebalancing Data

In [None]:
from sklearn.utils import resample

In [None]:
# Shrink the training set to cut computation time; drawing the same number of
# rows from each label also balances the two classes.
train0, train1 = (
    resample(train[train.label == label_value],
             replace=False,
             n_samples=20000,
             random_state=123)
    for label_value in (0, 1)
)

# Label-0 rows come first, followed by label-1 rows; original row indices are kept.
trainNew = pd.concat([train0, train1])
trainNew.head()

Unnamed: 0,text,label
112169,Are there any advantages on limiting car usage...,0
157967,"Dear The Florida State Senator,\n\nIn our nati...",0
128700,The community is very important that is why th...,0
79172,Is reading expressions by computers good\n\nEv...,0
177916,Who doesn't have a car now days? It seems life...,0


In [None]:
# Confirm the downsampled training set is balanced.
print(trainNew['label'].value_counts())

label
0    20000
1    20000
Name: count, dtype: int64


In [None]:
# Shrink the test set to cut computation time; drawing the same number of
# rows from each label also balances the two classes.
test0, test1 = (
    resample(test[test.label == label_value],
             replace=False,
             n_samples=5000,
             random_state=123)
    for label_value in (0, 1)
)

# Label-0 rows come first, followed by label-1 rows; original row indices are kept.
testNew = pd.concat([test0, test1])
testNew.head()

Unnamed: 0,text,label
35520,"Dear me. Senator, I am fed up with the elector...",0
36049,School systems are going above and beyond with...,0
6858,This process doesn't need to make a huge chang...,0
65181,"Dear Principal,\n\nI enjoy watching and partic...",0
85990,The electoral college system has been part of ...,0


In [None]:
# Confirm the downsampled test set is balanced.
print(testNew['label'].value_counts())

label
0    5000
1    5000
Name: count, dtype: int64


### Remove PII

In [None]:
import re

In [None]:
# Fetch spaCy's large English pipeline; per the installer output, the runtime
# may need a restart before the package can be loaded.
!python -m spacy download en_core_web_lg

Collecting en-core-web-lg==3.7.1
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-3.7.1/en_core_web_lg-3.7.1-py3-none-any.whl (587.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m587.7/587.7 MB[0m [31m896.4 kB/s[0m eta [36m0:00:00[0m
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_lg')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


In [None]:
# Load the English pipeline once; preprocess() reads this module-level `nlp` object.
nlp = spacy.load("en_core_web_lg")

In [None]:
def preprocess(text):
    """Strip common PII from ``text`` and return it as space-joined lemmas.

    Steps, in order:
      1. Replace newlines / carriage returns with spaces and lowercase the text.
      2. Delete e-mail addresses, phone numbers and SSN-like strings via regex.
      3. Run the spaCy pipeline ``nlp`` and drop tokens tagged as PERSON
         entities, punctuation tokens and whitespace-only tokens.
      4. Join the lemmas of the remaining tokens with single spaces.

    Parameters
    ----------
    text : str
        Raw document text.

    Returns
    -------
    str
        The lemmatized text with the items above removed.
    """
    #lowercase, remove newline
    # NOTE(review): NER runs on lowercased text here; PERSON recall may be lower
    # than on cased input -- confirm this is acceptable for the PII goal.
    text = text.replace('\n', ' ').replace('\r', ' ').lower()

    #regex for email, phone, ssn
    email_pattern = r'[\w\.-]+@[\w\.-]+'
    phone_pattern = r'\b(?:\+?(\d{1,3}))?[-.\s(]*(\d{3})[-.\s)]*(\d{3})[-.\s]*(\d{4})(?:\s*x(\d+))?\b'
    ssn_pattern = r'\b\d{3}-\d{2}-\d{4}\b|\bxxx-xx-xxxx\b'

    #remove email, phone, ssn
    for pattern in (email_pattern, phone_pattern, ssn_pattern):
        text = re.sub(pattern, '', text)

    #remove names, punctuation, return lemma
    doc = nlp(text)
    # Whitespace-only tokens are skipped too: deleting a match above can leave
    # consecutive spaces, which spaCy emits as their own tokens and which would
    # otherwise be re-joined into runs of blanks in the output.
    kept_lemmas = [
        token.lemma_
        for token in doc
        if not (token.ent_type_ == "PERSON" or token.is_punct or token.is_space)
    ]

    return " ".join(kept_lemmas)

In [None]:
# Smoke-test preprocess() on a sentence containing a name, e-mail, phone number and SSN.
text = (
    "John Doe's email is john.doe@example.com and his phone number is (123) 456-7890. "
    "His SSN is 123-45-6789. "
    "He likes butterflies and playing in the fields. "
    "He works as a manager of a local chinese restaurant."
)
cleaned_text = preprocess(text)
print(cleaned_text)

email be   and his phone number be his ssn be he like butterfly and play in the field he work as a manager of a local chinese restaurant


In [None]:
# Clean both splits so the saved train and test CSVs carry the same columns;
# previously only the training split received the `textNoPII` column.
trainNew['textNoPII'] = trainNew['text'].apply(preprocess)
testNew['textNoPII'] = testNew['text'].apply(preprocess)

In [None]:
#trainNew.head()

In [None]:
from google.colab import files

In [None]:
# Write both downsampled splits to the working directory without the index column.
trainNew.to_csv(path_or_buf='trainClean.csv', index=False)
testNew.to_csv(path_or_buf='testClean.csv', index=False)

In [None]:
# Hand the saved training CSV to the Colab `files` helper for download.
files.download('trainClean.csv')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
# Hand the saved test CSV to the Colab `files` helper for download.
files.download('testClean.csv')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
# A cell only renders its final expression, so the bare head() call was being
# discarded; display both previews explicitly instead.
display(trainNew.head(), trainNew.tail())

Unnamed: 0,text,label,textNoPII
273492,Their is no single answer to whether or not mo...,1,their be no single answer to whether or not mo...
2118,"Hey there! So, distant learning... MMM... let...",1,hey there so distant learning mmm let I tell...
47872,"As citizens, we must recognize the advantages...",1,as citizen we must recognize the advantage o...
57693,Ey aunt has always been a crucial part of my l...,1,ey aunt have always be a crucial part of my li...
234337,IU is often said UAU young people enjoy life m...,1,iu be often say uau young people enjoy life mo...
