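"""Preprocess the UCI News Aggregator dataset for CNN-based text classification.

Reads a CSV that is expected to contain at least the columns TITLE, CATEGORY,
and URL, cleans and lemmatizes the titles, extracts keywords from the URLs,
and writes the result to a new CSV.
"""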
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
# Download NLTK resources (run once)
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('stopwords')
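# Note: recent NLTK releases may additionally require the 'punkt_tab' resource
# for word_tokenize; if tokenization fails, try nltk.download('punkt_tab')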

def load_data(file_path):
    # Load dataset from a CSV file
    return pd.read_csv(file_path)


def clean_missing_values(data):
    # Drop rows with missing values in the 'TITLE' or 'CATEGORY' columns
    data = data.dropna(subset=['TITLE', 'CATEGORY'])
    return data


def normalize_text(text):
    # Convert text to string and normalize it
    text = str(text).lower()  # Ensure text is a string and lowercase it
    text = re.sub(r'\W+', ' ', text)  # Replace runs of non-word characters with a space
    text = text.strip()  # Remove leading and trailing whitespace
    return text
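
# For illustration (hypothetical title, not from the source data):
# normalize_text("U.S. Stocks Fall!") returns 'u s stocks fall'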

def lemmatize_text(text):
    # Lemmatize the text (note: not called by preprocess_data, which defines
    # its own title pipeline below; kept as a standalone helper)
    lemmatizer = WordNetLemmatizer()
    tokens = nltk.word_tokenize(text)  # Tokenize the text
    lemmatized_tokens = [lemmatizer.lemmatize(token) for token in tokens]  # Lemmatize each token
    return ' '.join(lemmatized_tokens)  # Join tokens back into a single string

def extract_url_features(data):
    # Nested helper to extract keywords from a URL
    def extract_keywords(url):
        url = re.sub(r'https?://', '', url)  # Remove the http:// or https:// protocol
        url = re.sub(r'www\.', '', url)  # Remove www.
        url = re.sub(r'\.[a-z]{2,3}/.*', '', url)  # Remove the domain suffix and everything after it
        keywords = re.split(r'\W+', url)  # Split the remaining host into keywords
        return " ".join(keywords)
    # Apply keyword extraction to the 'URL' column
    data['URL_KEYWORDS'] = data['URL'].apply(extract_keywords)
    return data

def preprocess_data(input_file_path, output_file_path):
    # Load and preprocess data
    data = load_data(input_file_path)
    data = clean_missing_values(data)
    data['TITLE'] = data['TITLE'].apply(normalize_text)  # Normalize text in the 'TITLE' column
    # Define custom stopwords and combine with NLTK's English stopwords
    # (most of these overlap with NLTK's list; 'new' and 'u' are the real additions)
    custom_stopwords = set(['new', 'u', 'to', 'the', 'in', 'and', 'of', 'a', 'for', 'on', 'with', 'at', 'is', 'that', 'it', 'this'])
    all_stopwords = set(stopwords.words('english')) | custom_stopwords
    # Create the lemmatizer once rather than once per title
    lemmatizer = WordNetLemmatizer()

    # Remove stopwords from a title and lemmatize the remaining words
    def preprocess_title(title):
        words = nltk.word_tokenize(title)  # Tokenize the title
        filtered_words = [word for word in words if word.lower() not in all_stopwords]  # Remove stopwords
        lemmatized_words = [lemmatizer.lemmatize(word) for word in filtered_words]  # Lemmatize words
        return ' '.join(lemmatized_words)  # Join words back into a single string

    # Apply title preprocessing to the 'TITLE' column
    data['TITLE'] = data['TITLE'].apply(preprocess_title)
    # Extract URL features
    data = extract_url_features(data)
    # Save preprocessed data to a new CSV file
    data.to_csv(output_file_path, index=False)
    print(f"Preprocessing completed. Preprocessed data saved to {output_file_path}")

if __name__ == '__main__':
    # File paths for input and output data
    input_file_path = './uci-news-aggregator_small.csv'
    output_file_path = './preprocessed_data.csv'
    # Preprocess the data
    preprocess_data(input_file_path, output_file_path)
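    # Downstream usage sketch (assumed, not part of this script): the cleaned
    # TITLE and URL_KEYWORDS columns could be combined into model input text:
    #   df = pd.read_csv(output_file_path)
    #   texts = (df['TITLE'] + ' ' + df['URL_KEYWORDS']).tolist()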