<a href="https://colab.research.google.com/github/olinyoder2534/AIWritingDetector/blob/main/AIWritingDetectorCleanData.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import spacy
#import warnings
#warnings.filterwarnings('ignore')

## Data Exploration

In [4]:
#The dataset ships pre-split, so each partition is read from its own CSV
train, test = (
    pd.read_csv(csv_path)
    for csv_path in ('/content/final_train[1].csv', '/content/final_test[1].csv')
)

In [5]:
#Preview the first rows of the training split
train.head()

Unnamed: 0,text,label
0,We should keep the Electoral College for a num...,0
1,More and more money is spent on building theat...,1
2,Limiting car usage can actually be effective b...,0
3,"Dear Mrs. Smith,\n\nI am writing to you today ...",1
4,"Dear Principal,\n\nAfter school or during scho...",0


In [6]:
#Preview the first rows of the test split
test.head()

Unnamed: 0,text,label
0,The Face on Mars is nothing but a natural occu...,0
1,Students have a higher chance of catching a vi...,0
2,Driverless cars have good and bad things that ...,0
3,Some people might think that traveling in a gr...,1
4,How many of us students want to be forced to d...,0


In [7]:
#Report (rows, columns) for the train split, then the test split, one per line
print(train.shape, test.shape, sep='\n')

(346977, 2)
(86587, 2)


In [8]:
#Class distribution of the label column for the train split, then the test split
print(train['label'].value_counts(), test['label'].value_counts(), sep='\n')

label
0    222154
1    124823
Name: count, dtype: int64
label
0    55845
1    30742
Name: count, dtype: int64


## Preprocessing

### Downsampling/rebalancing Data

In [9]:
from sklearn.utils import resample

In [12]:
#Shrink the training set to cut computation time, drawing the same number of rows
#(without replacement) from each label so the two classes end up balanced
train0, train1 = (
    resample(train[train.label == label_value],
             replace=False,
             n_samples=20000,
             random_state=123)
    for label_value in (0, 1)
)

trainNew = pd.concat([train0, train1])
trainNew.head()

Unnamed: 0,text,label
112169,Are there any advantages on limiting car usage...,0
157967,"Dear The Florida State Senator,\n\nIn our nati...",0
128700,The community is very important that is why th...,0
79172,Is reading expressions by computers good\n\nEv...,0
177916,Who doesn't have a car now days? It seems life...,0


In [11]:
#Confirm the downsampled training set has equal class counts
print(trainNew['label'].value_counts())

label
0    20000
1    20000
Name: count, dtype: int64


In [13]:
#Shrink the test set to cut computation time, drawing the same number of rows
#(without replacement) from each label so the two classes end up balanced
test0, test1 = (
    resample(test[test.label == label_value],
             replace=False,
             n_samples=5000,
             random_state=123)
    for label_value in (0, 1)
)

testNew = pd.concat([test0, test1])
testNew.head()

Unnamed: 0,text,label
35520,"Dear me. Senator, I am fed up with the elector...",0
36049,School systems are going above and beyond with...,0
6858,This process doesn't need to make a huge chang...,0
65181,"Dear Principal,\n\nI enjoy watching and partic...",0
85990,The electoral college system has been part of ...,0


In [14]:
#Confirm the downsampled test set has equal class counts
print(testNew['label'].value_counts())

label
0    5000
1    5000
Name: count, dtype: int64


### Remove PII

In [15]:
import re

In [16]:
#Fetch spaCy's large English pipeline; the installer output advises restarting the runtime afterwards so the package can be loaded
!python -m spacy download en_core_web_lg

Collecting en-core-web-lg==3.7.1
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-3.7.1/en_core_web_lg-3.7.1-py3-none-any.whl (587.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m587.7/587.7 MB[0m [31m1.1 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: en-core-web-lg
Successfully installed en-core-web-lg-3.7.1
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_lg')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


In [17]:
#Load the large English spaCy pipeline once; removePII and clean both call this shared nlp object
nlp = spacy.load("en_core_web_lg")

In [18]:
def removePII(text):
    """Strip common PII from ``text`` and return what is left of the text.

    Emails, phone numbers, and social security numbers are removed with
    regular expressions. The result is then run through the module-level
    spaCy pipeline ``nlp`` and every span it tags as a PERSON entity is cut
    out as well.

    Args:
        text: The raw string to scrub.

    Returns:
        The text with the matched PII removed. Everything else, including
        the original whitespace and line breaks, is kept in place.
    """
    #Regular expression patterns for emails, phone numbers, and social security numbers
    email_pattern = r'[\w\.-]+@[\w\.-]+'
    phone_pattern = r'\b(?:\+?(\d{1,3}))?[-.\s(]*(\d{3})[-.\s)]*(\d{3})[-.\s]*(\d{4})(?:\s*x(\d+))?\b'
    ssn_pattern = r'\b\d{3}-\d{2}-\d{4}\b|\bXXX-XX-XXXX\b'

    #Remove emails, phone numbers, and social security numbers
    text = re.sub(email_pattern, '', text)
    text = re.sub(phone_pattern, '', text)
    text = re.sub(ssn_pattern, '', text)

    doc = nlp(text)
    source = doc.text

    #Keep every stretch of text that lies outside a PERSON entity. Iterating
    #over doc.ents and keeping the *other entities* (as before) would throw
    #away all ordinary words, so the slices are taken from the text itself.
    kept_pieces = []
    cursor = 0
    for ent in doc.ents:
        if ent.label_ != "PERSON":
            continue
        kept_pieces.append(source[cursor:ent.start_char])
        cursor = ent.end_char
    kept_pieces.append(source[cursor:])

    return "".join(kept_pieces)

In [19]:
#Scrub PII from every training essay and keep the result in a new column
trainNew['textNoPII'] = trainNew['text'].map(removePII)

### Clean Strings

In [20]:
#Leaving stop words in and not worrying about case sensitivity. There's more you could do with this function, but for now I'm keeping it simple
def clean(text):
    """Lemmatize ``text`` and drop punctuation tokens.

    The text is tokenized with the module-level spaCy pipeline ``nlp``.
    Tokens flagged as punctuation are skipped; every other token is replaced
    by its lemma.

    Args:
        text: The string to normalize.

    Returns:
        The lemmas of the non-punctuation tokens joined by single spaces.
    """
    doc = nlp(text)
    #The result must be returned explicitly; without it the function yields
    #None and the column built from it holds no text at all.
    return " ".join(token.lemma_ for token in doc if not token.is_punct)

In [21]:
trainNew['textClean'] = trainNew['textNoPII'].apply(clean)

#testNew is exported as testClean.csv, so it goes through the same PII removal and cleaning as the training split
testNew['textNoPII'] = testNew['text'].apply(removePII)
testNew['textClean'] = testNew['textNoPII'].apply(clean)

In [26]:
from google.colab import files

In [25]:
#Write both processed splits to CSV files in the current working directory
trainNew.to_csv('trainClean.csv')
testNew.to_csv('testClean.csv')

In [27]:
#Send the exported training CSV to the browser via the Colab files helper
files.download('trainClean.csv')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [28]:
#Send the exported test CSV to the browser via the Colab files helper
files.download('testClean.csv')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>