
## Requires

- panda
- nltk
- textblob

In [6]:
import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from textblob import TextBlob
import time

# Download necessary NLTK data
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /Users/mpaz/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /Users/mpaz/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

## Full implementation starts here

In [7]:
def extract_keywords(text):
    words = word_tokenize(str(text).lower())
    stop_words = set(stopwords.words('english'))
    keywords = [word for word in words if word.isalnum() and word not in stop_words]
    return ', '.join(keywords[:5])

def get_sentiment(text):
    blob = TextBlob(str(text))
    sentiment = blob.sentiment.polarity
    if sentiment > 0:
        return 'Positive'
    elif sentiment < 0:
        return 'Negative'
    else:
        return 'Neutral'

def process_dataframe(df):
    processed_df = df[['Document ID', 'Tracking Number', 'Posted Date', 'Comment']].copy()
    processed_df['Keywords'] = processed_df['Comment'].apply(extract_keywords)
    processed_df['Sentiment'] = processed_df['Comment'].apply(get_sentiment)
    processed_df['Sentiment_Score'] = processed_df['Comment'].apply(lambda x: TextBlob(str(x)).sentiment.polarity)
    return processed_df

def analyze_sentiment(df):
    sentiment_distribution = df['Sentiment'].value_counts(normalize=True) * 100
    average_sentiment = df['Sentiment_Score'].mean()
    return sentiment_distribution, average_sentiment

def get_df_size_info(dataframe):
    memory_usage = dataframe.memory_usage(deep=True).sum()
    return {
        'Shape': dataframe.shape,
        'Memory usage (MB)': memory_usage / 1e6,
        'Number of columns': len(dataframe.columns)
    }

def compare_dataframes(original_df, processed_df):
    original_size = original_df.memory_usage(deep=True).sum()
    processed_size = processed_df.memory_usage(deep=True).sum()
    size_difference = original_size - processed_size
    size_difference_percentage = (size_difference / original_size) * 100
    return {
        'Original size (MB)': original_size / 1e6,
        'Processed size (MB)': processed_size / 1e6,
        'Difference (MB)': size_difference / 1e6,
        'Difference (%)': size_difference_percentage
    }

def profile_function(func, *args, **kwargs):
    start_time = time.time()
    result = func(*args, **kwargs)
    end_time = time.time()
    execution_time = end_time - start_time
    print(f"{func.__name__} execution time: {execution_time:.4f} seconds")
    return result

In [8]:
uploaded_file = "../data/lym-yniq-kanh.csv"
df = pd.read_csv(uploaded_file)

In [9]:
print(df)
df.head()

              Document ID Agency ID      Docket ID Tracking Number  \
0     DEA-2024-0059-27369       DEA  DEA-2024-0059   lya-hq0l-x1cq   
1     DEA-2024-0059-27370       DEA  DEA-2024-0059   lyc-6a7o-dd95   
2     DEA-2024-0059-27371       DEA  DEA-2024-0059   lyc-5a9y-trso   
3     DEA-2024-0059-27372       DEA  DEA-2024-0059   lyc-5912-vi0z   
4     DEA-2024-0059-27373       DEA  DEA-2024-0059   lyc-576r-6ez4   
...                   ...       ...            ...             ...   
2201  DEA-2024-0059-29570       DEA  DEA-2024-0059   lyj-6vb1-9dc9   
2202  DEA-2024-0059-29571       DEA  DEA-2024-0059   lyj-729r-mdda   
2203  DEA-2024-0059-29572       DEA  DEA-2024-0059   lyk-7qhc-gi81   
2204  DEA-2024-0059-29573       DEA  DEA-2024-0059   lyk-7s78-tclt   
2205  DEA-2024-0059-29574       DEA  DEA-2024-0059   lyk-8avb-2js6   

          Document Type        Posted Date  Is Withdrawn?  \
0     Public Submission  2024-07-07T04:00Z          False   
1     Public Submission  2024-07-07T0

Unnamed: 0,Document ID,Agency ID,Docket ID,Tracking Number,Document Type,Posted Date,Is Withdrawn?,Federal Register Number,FR Citation,Title,...,Government Agency,Government Agency Type,Comment,Category,Restrict Reason Type,Restrict Reason,Reason Withdrawn,Content Files,Attachment Files,"Display Properties (Name, Label, Tooltip)"
0,DEA-2024-0059-27369,DEA,DEA-2024-0059,lya-hq0l-x1cq,Public Submission,2024-07-07T04:00Z,False,,,Comment on FR Doc # 2024-11137,...,,,Any educated person can understand how absurd ...,,,,,,,
1,DEA-2024-0059-27370,DEA,DEA-2024-0059,lyc-6a7o-dd95,Public Submission,2024-07-07T04:00Z,False,,,Comment on FR Doc # 2024-11137,...,,,Marijuana should be removed completely from th...,,,,,,,
2,DEA-2024-0059-27371,DEA,DEA-2024-0059,lyc-5a9y-trso,Public Submission,2024-07-07T04:00Z,False,,,Comment on FR Doc # 2024-11137,...,,,Docket No. DEA-1362\n\nI am in favor of moving...,,,,,,,
3,DEA-2024-0059-27372,DEA,DEA-2024-0059,lyc-5912-vi0z,Public Submission,2024-07-07T04:00Z,False,,,Comment on FR Doc # 2024-11137,...,,,Marijuana should be deregulated as an adult it...,,,,,,,
4,DEA-2024-0059-27373,DEA,DEA-2024-0059,lyc-576r-6ez4,Public Submission,2024-07-07T04:00Z,False,,,Comment on FR Doc # 2024-11137,...,,,\nI think marijuana should be completely de sc...,,,,,,,


In [10]:
# Main execution
# Assuming you've already loaded your CSV file into a DataFrame called 'df'

# Profile each function
processed_df = profile_function(process_dataframe, df)
sentiment_dist, avg_sentiment = profile_function(analyze_sentiment, processed_df)
original_size_info = profile_function(get_df_size_info, df)
processed_size_info = profile_function(get_df_size_info, processed_df)
size_comparison = profile_function(compare_dataframes, df, processed_df)

# Print results
print("\nProcessed DataFrame:")
print(processed_df.head())

print("\nSentiment Analysis:")
print(f"Distribution:\n{sentiment_dist}")
print(f"Average Sentiment Score: {avg_sentiment:.2f}")

print("\nDataFrame Size Information:")
print("Original DataFrame:", original_size_info)
print("Processed DataFrame:", processed_size_info)

print("\nSize Comparison:")
print(size_comparison)

process_dataframe execution time: 28.8517 seconds
analyze_sentiment execution time: 0.0013 seconds
get_df_size_info execution time: 0.0160 seconds
get_df_size_info execution time: 0.0056 seconds
compare_dataframes execution time: 0.0175 seconds

Processed DataFrame:
           Document ID Tracking Number        Posted Date  \
0  DEA-2024-0059-27369   lya-hq0l-x1cq  2024-07-07T04:00Z   
1  DEA-2024-0059-27370   lyc-6a7o-dd95  2024-07-07T04:00Z   
2  DEA-2024-0059-27371   lyc-5a9y-trso  2024-07-07T04:00Z   
3  DEA-2024-0059-27372   lyc-5912-vi0z  2024-07-07T04:00Z   
4  DEA-2024-0059-27373   lyc-576r-6ez4  2024-07-07T04:00Z   

                                             Comment  \
0  Any educated person can understand how absurd ...   
1  Marijuana should be removed completely from th...   
2  Docket No. DEA-1362\n\nI am in favor of moving...   
3  Marijuana should be deregulated as an adult it...   
4  \nI think marijuana should be completely de sc...   

                             