In [4]:
import pandas as pd

# Load the social media posts from the CSV file
df = pd.read_csv('/content/social media.csv')

# Display the first few rows of the dataframe
print(df.head())


     post_id     user_id  username    location  followers_count  \
101        1    john_doe  New York         USA              500   
102        2  jane_smith    London          UK             1200   
103        3  mike_brown   Toronto      Canada              900   
104        4  alice_wong    Sydney   Australia             1500   
105        5     lucy_li   Beijing       China              700   

    profile_creation_date                                  post_content  \
101            2022-01-15               Loving the new product release!   
102            2020-06-25        This service is terrible, never again.   
103            2021-05-10  The update feels just okay, nothing special.   
104            2019-11-20        The best app I’ve used in a long time!   
105            2023-03-05   I am really disappointed with this service.   

      post_date   platform  sentiment_score_id sentiment_type  \
101  2024-09-18    Twitter                   1       positive   
102  2024-09-19 

1.DATA PREPROCCESSING

In [6]:
import re
import string
from sklearn.preprocessing import MinMaxScaler
from sklearn.feature_extraction.text import CountVectorizer

# Load the dataset
df = pd.read_csv('/content/social media.csv')

# Step 1: Handle Missing or Null Values
df.fillna('', inplace=True)

# Step 2: Convert Dates to Datetime Format
df['post_date'] = pd.to_datetime(df['post_date'], format='%Y-%m-%d')
df['profile_creation_date'] = pd.to_datetime(df['profile_creation_date'], format='%Y-%m-%d')

# Step 3: Normalize Text Data
def clean_text(text):
    text = text.lower()  # Lowercase
    text = re.sub(r'http\S+|www\S+|https\S+', '', text, flags=re.MULTILINE)  # Remove URLs
    text = text.translate(str.maketrans('', '', string.punctuation))  # Remove punctuation
    text = re.sub(r'\d+', '', text)  # Remove numbers
    return text

df['post_content_cleaned'] = df['post_content'].apply(clean_text)

# Step 4: Handle Categorical Features
df = pd.get_dummies(df, columns=['platform', 'sentiment_type'], drop_first=True)

# Step 5: Feature Scaling
scaler = MinMaxScaler()
df[['followers_count', 'confidence_score']] = scaler.fit_transform(df[['followers_count', 'confidence_score']])

# Step 6: Tokenization and Text Vectorization
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(df['post_content_cleaned'])
X_df = pd.DataFrame(X.toarray(), columns=vectorizer.get_feature_names_out())

# Combine the vectorized text data with the original DataFrame
df = pd.concat([df, X_df], axis=1)

# Step 7: Save Preprocessed Data
df.to_csv('Preprocessed_SocialMediaData.csv', index=False)

# Display the first few rows of the preprocessed DataFrame
print(df.head())


     post_id     user_id  username    location  followers_count  \
101      1.0    john_doe  New York         USA         0.038462   
102      2.0  jane_smith    London          UK         0.307692   
103      3.0  mike_brown   Toronto      Canada         0.192308   
104      4.0  alice_wong    Sydney   Australia         0.423077   
105      5.0     lucy_li   Beijing       China         0.115385   

    profile_creation_date                                  post_content  \
101            2022-01-15               Loving the new product release!   
102            2020-06-25        This service is terrible, never again.   
103            2021-05-10  The update feels just okay, nothing special.   
104            2019-11-20        The best app I’ve used in a long time!   
105            2023-03-05   I am really disappointed with this service.   

     post_date  sentiment_score_id  confidence_score  ... this time  to  \
101 2024-09-18                 1.0          0.714286  ...  NaN  NaN NaN

2.SENTIMENT ANALYSIS

In [8]:
import pandas as pd
from textblob import TextBlob

# Load the preprocessed dataset
df = pd.read_csv('Preprocessed_SocialMediaData.csv')

# Ensure all values in post_content_cleaned are strings
df['post_content_cleaned'] = df['post_content_cleaned'].astype(str)

# Function to analyze sentiment using TextBlob
def analyze_sentiment(text):
    analysis = TextBlob(text)
    return analysis.sentiment.polarity

# Apply the sentiment analysis function to the cleaned post content
df['sentiment_score'] = df['post_content_cleaned'].apply(analyze_sentiment)

# Classify sentiment based on the polarity score
def classify_sentiment(score):
    if score > 0:
        return 'positive'
    elif score < 0:
        return 'negative'
    else:
        return 'neutral'

df['sentiment_class'] = df['sentiment_score'].apply(classify_sentiment)

# Display the updated DataFrame with sentiment scores
print(df[['post_content', 'sentiment_score', 'sentiment_class']].head())

# Save the DataFrame with sentiment scores to a new CSV file
df.to_csv('SocialMediaData_WithSentiment.csv', index=False)


                                   post_content  sentiment_score  \
0               Loving the new product release!         0.368182   
1        This service is terrible, never again.        -1.000000   
2  The update feels just okay, nothing special.         0.428571   
3        The best app I’ve used in a long time!         0.475000   
4   I am really disappointed with this service.        -0.750000   

  sentiment_class  
0        positive  
1        negative  
2        positive  
3        positive  
4        negative  


3.FEATURE ENGINEERING

In [10]:
import pandas as pd
import re

# Load the dataset
df = pd.read_csv('Preprocessed_SocialMediaData.csv')

# Ensure all values in post_content are strings
df['post_content'] = df['post_content'].astype(str)

# Function to extract hashtags
def extract_hashtags(text):
    hashtags = re.findall(r'#\w+', text)
    return ', '.join(hashtags)  # Join multiple hashtags into a single string

# Function to extract keywords (e.g., words with more than 4 characters)
def extract_keywords(text):
    words = re.findall(r'\w+', text)
    keywords = [word for word in words if len(word) > 4]  # Customize the length as needed
    return ', '.join(keywords)

# Apply the functions to extract hashtags and keywords
df['hashtags'] = df['post_content'].apply(extract_hashtags)
df['keywords'] = df['post_content'].apply(extract_keywords)

# Optionally: Count the number of hashtags and keywords
df['hashtag_count'] = df['hashtags'].apply(lambda x: len(x.split(', ')) if x else 0)
df['keyword_count'] = df['keywords'].apply(lambda x: len(x.split(', ')) if x else 0)

# Display the updated DataFrame with new features
print(df[['post_content', 'hashtags', 'keywords', 'hashtag_count', 'keyword_count']].head())

# Save the DataFrame with new features to a new CSV file
df.to_csv('SocialMediaData_WithFeatures.csv', index=False)


                                   post_content hashtags  \
0               Loving the new product release!            
1        This service is terrible, never again.            
2  The update feels just okay, nothing special.            
3        The best app I’ve used in a long time!            
4   I am really disappointed with this service.            

                          keywords  hashtag_count  keyword_count  
0         Loving, product, release              0              3  
1  service, terrible, never, again              0              4  
2  update, feels, nothing, special              0              4  
3                                               0              0  
4    really, disappointed, service              0              3  
