In [195]:
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.svm import LinearSVC
from sklearn.cluster import KMeans

In [196]:
data = pd.read_excel(r'/Users/student/Desktop/2020102_final/chatgpt1.xlsx')
data['Text'] = data['Text'].astype(str)
data = data[data['Language'] == 'en']

In [197]:
# Define a function to clean the text
def clean_text(text):  
    # Remove special characters and numbers
    text = re.sub(r'[^A-Za-z@#]', ' ', text)
    
    # Convert to lowercase
    text = text.lower()
    
    # Tokenize the text
    tokens = word_tokenize(text)
    
    # Remove stopwords
    stop_words = set(stopwords.words('english'))
    tokens = [token for token in tokens if token not in stop_words]
    
    # Join tokens back into a string
    cleaned_text = ' '.join(tokens)
    
    return cleaned_text

In [198]:
data_en.loc[:, 'cleaned_text'] = data['Text'].apply(clean_text)

In [199]:
limit = 1000
data_en = data_en.head(limit)

In [200]:
# Extract the cleaned text and user labels
tweets = data_en['cleaned_text'].tolist()
user_labels = data_en['Username'].tolist()

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(tweets, user_labels, test_size=0.2, random_state=42)

# Vectorize the tweets using TF-IDF
vectorizer = TfidfVectorizer()
X_train_vectorized = vectorizer.fit_transform(X_train)
X_test_vectorized = vectorizer.transform(X_test)

# Train a classifier (e.g., Linear Support Vector Classifier)
classifier = LinearSVC()
classifier.fit(X_train_vectorized, y_train)

# Evaluate the classifier on the test set
accuracy = classifier.score(X_test_vectorized, y_test)
print("Accuracy:", accuracy)

Accuracy: 0.075


In [202]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import DBSCAN

# Combine 'Text' and 'hashtag' columns into a single feature column
data_en['features'] = data_en['Text'] + ' ' + data_en['hashtag'].replace('[\[\]]', '', regex=True)

# Remove rows with empty features
data_en = data_en[data_en['features'].str.strip().astype(bool)]

# Remove rows with missing values in the 'features' column
data_en = data_en.dropna(subset=['features'])

# Apply the clean_text function to the 'features' column
data_en['cleaned_features'] = data_en['features'].apply(clean_text)

# Extract the cleaned features as a list of strings
cleaned_features = data_en['cleaned_features'].tolist()



# Vectorize the cleaned features using TF-IDF
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(cleaned_features)

print(X)

# Perform DBSCAN clustering
dbscan = DBSCAN(eps=0.5, min_samples=5)  # Adjust the parameters as needed
cluster_labels = dbscan.fit_predict(X)

# Add cluster labels to the DataFrame
data_en['cluster_label'] = cluster_labels

# Print the cluster labels
print(data_en[['Text', 'hashtag', 'cluster_label']])


  (0, 3838)	0.3059235948162466
  (0, 1008)	0.42444722290938536
  (0, 741)	0.2951988045059948
  (0, 1984)	0.400596350094066
  (0, 49)	0.38367388652300943
  (0, 654)	0.06378708155988883
  (0, 3431)	0.42444722290938536
  (0, 127)	0.38367388652300943
  (1, 223)	0.5652653451644989
  (1, 1379)	0.5652653451644989
  (1, 736)	0.1307074016568685
  (1, 2003)	0.13086025879410806
  (1, 486)	0.5652653451644989
  (1, 654)	0.08494961146838338
  (2, 4652)	0.3781167068221278
  (2, 4542)	0.3781167068221278
  (2, 4385)	0.3781167068221278
  (2, 3336)	0.27253051962081365
  (2, 1809)	0.3781167068221278
  (2, 4451)	0.29929911810336907
  (2, 3004)	0.3205465500501374
  (2, 3775)	0.3781167068221278
  (2, 736)	0.08743265918307742
  (2, 2003)	0.08753490822035115
  (2, 654)	0.05682440575743562
  :	:
  (998, 3761)	0.17987023697263155
  (998, 1469)	0.3597404739452631
  (998, 3798)	0.5092884806352516
  (998, 1382)	0.15702899611767412
  (998, 736)	0.041591743614242054
  (998, 2003)	0.04164038351359805
  (998, 654)	0.02

AttributeError: 'NoneType' object has no attribute 'split'

In [170]:
from sklearn.ensemble import RandomForestRegressor

# Select relevant columns for modeling
selected_columns = ['Text', 'ReplyCount', 'RetweetCount', 'LikeCount']
data_engagement_pred = data_en[selected_columns]

# Remove rows with missing values
data_engagement_pred = data_engagement_pred.dropna()

# Perform text preprocessing on the 'Text' column
vectorizer = TfidfVectorizer()
X_text = vectorizer.fit_transform(data_engagement_pred['Text'])

# Create the input features by concatenating the preprocessed 'Text' column with other features
X_other = data_engagement_pred.drop(['Text', 'ReplyCount', 'RetweetCount', 'LikeCount'], axis=1)
X = pd.concat([pd.DataFrame(X_text.toarray()), X_other], axis=1)

# Split the data into training and testing sets
y = data_engagement_pred[['ReplyCount', 'RetweetCount', 'LikeCount']]  # Target variables
X_train, X_test, y_train, y_test = train_test_split(X[:1000], y, test_size=0.2, random_state=42)

# Train the Random Forest model
rf = RandomForestRegressor(n_estimators=100, random_state=42)
rf.fit(X_train, y_train)

# Make predictions on the testing set
y_pred = rf.predict(X_test)

# Print the predicted engagement metrics
print(y_pred)

[[6.00000000e-02 3.00000000e-02 4.30000000e-01]
 [4.70000000e-01 2.40000000e-01 2.31000000e+00]
 [2.85166667e-01 1.40000000e-01 7.70000000e-01]
 [6.20700000e+01 2.18900000e+01 1.13890000e+02]
 [2.50000000e-01 2.00000000e-02 3.80000000e-01]
 [4.40000000e-01 1.10000000e-01 2.34000000e+00]
 [1.10000000e-01 8.00000000e-02 8.20000000e-01]
 [8.00000000e-01 3.90000000e-01 2.11000000e+00]
 [8.00000000e-02 0.00000000e+00 4.00000000e-02]
 [3.00000000e-02 2.00000000e-02 1.60000000e-01]
 [1.30000000e-01 5.00000000e-02 1.80000000e-01]
 [1.60000000e-01 1.00000000e-02 1.07000000e+00]
 [1.00000000e-01 1.10000000e-01 5.50000000e-01]
 [0.00000000e+00 0.00000000e+00 1.00000000e-02]
 [9.00000000e-02 9.00000000e-02 1.10000000e+00]
 [1.60000000e-01 5.00000000e-02 5.40000000e-01]
 [7.00000000e-02 4.00000000e-02 2.10000000e-01]
 [7.30000000e-01 9.80000000e-01 3.70000000e+00]
 [7.33333333e-02 6.00000000e-02 2.60000000e-01]
 [2.70000000e-01 2.00000000e-02 1.26000000e+00]
 [2.30000000e-01 9.00000000e-02 4.100000

In [191]:
from sklearn.naive_bayes import MultinomialNB

# Remove rows with missing or null values in the 'Text' and 'Category' columns
data_en = data_en.dropna(subset=['Text', 'hastag_counts'])

# Extract hashtags from cleaned text column
hashtags = []
for tweet in data_en['Text']:
    extracted_hashtags = re.findall(r'#\w+', tweet)
    if extracted_hashtags:
        hashtags.append(' '.join(extracted_hashtags))

# Create term-frequency matrix
vectorizer = CountVectorizer()
term_freq_matrix = vectorizer.fit_transform(hashtags)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(term_freq_matrix, data_en['hastag_counts'].head(288), test_size=0.2, random_state=42)

# Train the Naive Bayes classifier
naive_bayes = MultinomialNB()
naive_bayes.fit(X_train, y_train)

# Make predictions on the testing set
y_pred = naive_bayes.predict(X_test)

# Print the predicted categories
print(y_pred)


[0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 1 0 0 0 0]


In [203]:
pip install --upgrade scikit-learn

Note: you may need to restart the kernel to use updated packages.
