In [14]:
import pandas as pd

# Load review datasets with specified encoding
try:
    dfr1 = pd.read_csv('/content/enwiki-20170820.csv', encoding='latin-1')
except pd.errors.ParserError:
    print("Error: Unable to parse the CSV file.")
    dfr1 = None

# Check if data was successfully loaded
if dfr1 is not None:
    # Display the first few rows of the original dataset
    print("First few rows of the original dataset:")
    print(dfr1.head())

    # Select only ARTICLE_ID and SECTION_TEXT columns
    dfr1_subset = dfr1[['ARTICLE_ID', 'SECTION_TEXT']]

    # Display the first few rows of the subset
    print("\nSubset of the dataset (ARTICLE_ID and SECTION_TEXT columns only):")
    print(dfr1_subset.head())

    # Check for missing values
    print("\nDFR1 NULL:")
    display(dfr1.isnull().sum())

    # Drop unnecessary columns
    dfr1.drop(columns=['TITLE', 'SECTION_TITLE'], inplace=True)

    # Display the first two rows and data information after dropping columns
    print("\nDFR1 (First two rows after dropping columns):")
    display(dfr1.head(2))
    print("\nDFR1 (Data information after dropping columns):")
    display(dfr1.info())
else:
    print("No data loaded due to parsing error.")


First few rows of the original dataset:
                                          ARTICLE_ID      TITLE  \
0                                                  0  Anarchism   
1                                                  0  Anarchism   
2                                                  0  Anarchism   
3  s by indigenous feminist ecological and cultur...        NaN   
4                                                NaN        NaN   

               SECTION_TITLE  \
0               Introduction   
1  Etymology and terminology   
2                    History   
3                        NaN   
4                        NaN   

                                        SECTION_TEXT  
0  \n\n\n\n\n\n'''Anarchism''' is a political phi...  
1  \n\nThe term ''anarchism'' is a compound word ...  
2  \n\n===Origins===\nWoodcut from a Diggers docu...  
3                                                NaN  
4                                                NaN  

Subset of the dataset (ARTICLE_ID

ARTICLE_ID       465339
TITLE            973962
SECTION_TITLE    973988
SECTION_TEXT     974005
dtype: int64


DFR1 (First two rows after dropping columns):


Unnamed: 0,ARTICLE_ID,SECTION_TEXT
0,0,\n\n\n\n\n\n'''Anarchism''' is a political phi...
1,0,\n\nThe term ''anarchism'' is a compound word ...



DFR1 (Data information after dropping columns):
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1016621 entries, 0 to 1016620
Data columns (total 2 columns):
 #   Column        Non-Null Count   Dtype 
---  ------        --------------   ----- 
 0   ARTICLE_ID    551282 non-null  object
 1   SECTION_TEXT  42616 non-null   object
dtypes: object(2)
memory usage: 15.5+ MB


None

In [15]:
import pandas as pd
import re
from nltk.tokenize import word_tokenize
import nltk

# Load review dataset with specified encoding
try:
    dfr1 = pd.read_csv('/content/enwiki-20170820.csv', encoding='latin-1')
except pd.errors.ParserError:
    print("Error: Unable to parse the CSV file.")
    dfr1 = None

# Check if data was successfully loaded
if dfr1 is not None:
    # Drop rows with missing values
    dfr1.dropna(inplace=True)

    # Tokenize the review content
    dfr1['SECTION_TEXT'] = dfr1['SECTION_TEXT'].apply(lambda x: word_tokenize(str(x)))

    # Remove non-alphanumeric characters from review content
    dfr1['SECTION_TEXT'] = dfr1['SECTION_TEXT'].apply(lambda x: [re.sub(r'\W', ' ', word) for word in x])

    # Save the preprocessed data to a separate file
    output_file_path = '/content/preprocessed_data.csv'
    dfr1.to_csv(output_file_path, index=False)
    print(f"Preprocessed data saved to {output_file_path}")

else:
    print("No data loaded due to parsing error.")


Preprocessed data saved to /content/preprocessed_data.csv


In [18]:
import pandas as pd
import re
import concurrent.futures
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer

# Download NLTK data
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

def load_data(file_path):
    try:
        return pd.read_csv(file_path)
    except pd.errors.ParserError:
        print(f"Error: Unable to parse the CSV file at {file_path}.")
        return None

def preprocess_text(text):
    # Tokenize the text
    tokens = word_tokenize(text)

    # Remove stopwords
    stop_words = set(stopwords.words('english'))
    filtered_tokens = [word for word in tokens if word.lower() not in stop_words]

    # Stemming
    stemmer = PorterStemmer()
    stemmed_words = [stemmer.stem(word) for word in filtered_tokens]

    # Lemmatizing
    lemmatizer = WordNetLemmatizer()
    lemmatized_words = [lemmatizer.lemmatize(word) for word in stemmed_words]

    processed_text = ' '.join(lemmatized_words)
    return processed_text

def remove_special_characters(text):
    # Remove all non-alphanumeric characters
    return re.sub(r'[^A-Za-z0-9\s]', '', text)

def parallel_preprocess(data):
    with concurrent.futures.ThreadPoolExecutor(max_workers=6) as executor:
        processed_data = list(executor.map(preprocess_text, data))
    return processed_data

# Load data
file_path = '/content/preprocessed_data.csv'
sample_df = load_data(file_path)

if sample_df is not None:
    # Drop null rows
    sample_df.dropna(inplace=True)

    # Drop unnecessary columns
    sample_df.drop(columns=['TITLE', 'SECTION_TITLE'], inplace=True)

    # Preprocess the 'SECTION_TEXT' column
    sample_df['SECTION_TEXT'] = sample_df['SECTION_TEXT'].apply(preprocess_text)

    # Remove commas and special characters
    sample_df['SECTION_TEXT'] = sample_df['SECTION_TEXT'].apply(remove_special_characters)

    # Display the first rows of the dataset
    print("First rows of the dataset:")
    print(sample_df.head(10))

    # Save the cleaned and preprocessed data to a new CSV file
    cleaned_preprocessed_data_path = '/content/cleaned_preprocessed_data.csv'
    sample_df.to_csv(cleaned_preprocessed_data_path, index=False)
    print(f"Cleaned and preprocessed data saved to {cleaned_preprocessed_data_path}")
else:
    print("No data loaded due to parsing error.")


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


First rows of the dataset:
   ARTICLE_ID                                       SECTION_TEXT
0           0       anarch         i      polit   philosophi ...
1           0   the   term      anarch      i      compound  ...
2           0    origin   woodcut   from      digger   docume...
3          21       agricultur   scienc         i      broad ...
4          21   the   three   term   are   often   confus    ...
5          21   one   of   the   most   common   yield   redu...
6          21   with   the   except   of   theoret   agronomi...
7          21   agricultur   scienc   began   with   gregor  ...
8          21   norman   borlaug   father   of   the   green ...
9          21   agricultur   scienc   seek   to   feed   the ...
Cleaned and preprocessed data saved to /content/cleaned_preprocessed_data.csv


In [21]:
import pandas as pd
import re

def remove_non_alphanumeric(text):
    """
    Function to remove non-alphanumeric characters from text using regular expressions.
    """
    # Regular expression to match any non-alphanumeric character
    pattern = r'\d+'

    # Replace non-alphanumeric characters with an empty string
    text_without_non_alphanumeric = re.sub(pattern, '', str(text))

    return text_without_non_alphanumeric

# Load cleaned CSV data
cleaned_data_path = '/content/cleaned_preprocessed_data.csv'
cleaned_df = pd.read_csv(cleaned_data_path)

# Apply remove_non_alphanumeric function to the 'SECTION_TEXT' column
cleaned_df['SECTION_TEXT'] = cleaned_df['SECTION_TEXT'].apply(remove_non_alphanumeric)

# Save the updated data to a new CSV file
updated_data_path = '/content/updated_cleaned_preprocessed_data.csv'
cleaned_df.to_csv(updated_data_path, index=False)
print(f"Updated cleaned data saved to {updated_data_path}")


Updated cleaned data saved to /content/updated_cleaned_preprocessed_data.csv
